1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CONTEXT_TRACKING_H #define _LINUX_CONTEXT_TRACKING_H #include <linux/sched.h> #include <linux/vtime.h> #include <linux/context_tracking_state.h> #include <linux/instrumentation.h> #include <asm/ptrace.h> #ifdef CONFIG_CONTEXT_TRACKING_USER extern void ct_cpu_track_user(int cpu); /* Called with interrupts disabled. */ extern void __ct_user_enter(enum ctx_state state); extern void __ct_user_exit(enum ctx_state state); extern void ct_user_enter(enum ctx_state state); extern void ct_user_exit(enum ctx_state state); extern void user_enter_callable(void); extern void user_exit_callable(void); static inline void user_enter(void) { if (context_tracking_enabled()) ct_user_enter(CONTEXT_USER); } static inline void user_exit(void) { if (context_tracking_enabled()) ct_user_exit(CONTEXT_USER); } /* Called with interrupts disabled. */ static __always_inline void user_enter_irqoff(void) { if (context_tracking_enabled()) __ct_user_enter(CONTEXT_USER); } static __always_inline void user_exit_irqoff(void) { if (context_tracking_enabled()) __ct_user_exit(CONTEXT_USER); } static inline enum ctx_state exception_enter(void) { enum ctx_state prev_ctx; if (IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK) || !context_tracking_enabled()) return 0; prev_ctx = __ct_state(); if (prev_ctx != CONTEXT_KERNEL) ct_user_exit(prev_ctx); return prev_ctx; } static inline void exception_exit(enum ctx_state prev_ctx) { if (!IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK) && context_tracking_enabled()) { if (prev_ctx != CONTEXT_KERNEL) ct_user_enter(prev_ctx); } } static __always_inline bool context_tracking_guest_enter(void) { if (context_tracking_enabled()) __ct_user_enter(CONTEXT_GUEST); return context_tracking_enabled_this_cpu(); } static __always_inline void context_tracking_guest_exit(void) { if (context_tracking_enabled()) __ct_user_exit(CONTEXT_GUEST); } #define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond)) #else static inline void user_enter(void) { } static inline void user_exit(void) { } static inline void user_enter_irqoff(void) { } static inline void user_exit_irqoff(void) { } static inline int exception_enter(void) { return 0; } static inline void exception_exit(enum ctx_state prev_ctx) { } static inline int ct_state(void) { return -1; } static inline int __ct_state(void) { return -1; } static __always_inline bool context_tracking_guest_enter(void) { return false; } static __always_inline void context_tracking_guest_exit(void) { } #define CT_WARN_ON(cond) do { } while (0) #endif /* !CONFIG_CONTEXT_TRACKING_USER */ #ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE extern void context_tracking_init(void); #else static inline void context_tracking_init(void) { } #endif /* CONFIG_CONTEXT_TRACKING_USER_FORCE */ #ifdef CONFIG_CONTEXT_TRACKING_IDLE extern void ct_idle_enter(void); extern void ct_idle_exit(void); /* * Is the current CPU in an extended quiescent state? * * No ordering, as we are sampling CPU-local information. */ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) { return !(raw_atomic_read(this_cpu_ptr(&context_tracking.state)) & RCU_DYNTICKS_IDX); } /* * Increment the current CPU's context_tracking structure's ->state field * with ordering. Return the new value. */ static __always_inline unsigned long ct_state_inc(int incby) { return raw_atomic_add_return(incby, this_cpu_ptr(&context_tracking.state)); } static __always_inline bool warn_rcu_enter(void) { bool ret = false; /* * Horrible hack to shut up recursive RCU isn't watching fail since * lots of the actual reporting also relies on RCU. */ preempt_disable_notrace(); if (rcu_dynticks_curr_cpu_in_eqs()) { ret = true; ct_state_inc(RCU_DYNTICKS_IDX); } return ret; } static __always_inline void warn_rcu_exit(bool rcu) { if (rcu) ct_state_inc(RCU_DYNTICKS_IDX); preempt_enable_notrace(); } #else static inline void ct_idle_enter(void) { } static inline void ct_idle_exit(void) { } static __always_inline bool warn_rcu_enter(void) { return false; } static __always_inline void warn_rcu_exit(bool rcu) { } #endif /* !CONFIG_CONTEXT_TRACKING_IDLE */ #endif
6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 /* * linux/drivers/video/fb_notify.c * * Copyright (C) 2006 Antonino Daplas <adaplas@pol.net> * * 2001 - Documented with DocBook * - Brad Douglas <brad@neruo.com> * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details. */ #include <linux/fb.h> #include <linux/notifier.h> #include <linux/export.h> static BLOCKING_NOTIFIER_HEAD(fb_notifier_list); /** * fb_register_client - register a client notifier * @nb: notifier block to callback on events * * Return: 0 on success, negative error code on failure. */ int fb_register_client(struct notifier_block *nb) { return blocking_notifier_chain_register(&fb_notifier_list, nb); } EXPORT_SYMBOL(fb_register_client); /** * fb_unregister_client - unregister a client notifier * @nb: notifier block to callback on events * * Return: 0 on success, negative error code on failure. */ int fb_unregister_client(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&fb_notifier_list, nb); } EXPORT_SYMBOL(fb_unregister_client); /** * fb_notifier_call_chain - notify clients of fb_events * @val: value passed to callback * @v: pointer passed to callback * * Return: The return value of the last notifier function */ int fb_notifier_call_chain(unsigned long val, void *v) { return blocking_notifier_call_chain(&fb_notifier_list, val, v); } EXPORT_SYMBOL_GPL(fb_notifier_call_chain);
381 381 368 23 3 380 7 7 806 251 245 250 411 118 329 5 1 5 357 357 84 284 71 71 329 328 21 10 8 2 5 10 304 306 304 19 381 87 304 12 76 173 143 37 2 170 306 3 382 10 9 384 384 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2003 * Copyright (c) Cisco 1999,2000 * Copyright (c) Motorola 1999,2000,2001 * Copyright (c) La Monte H.P. Yarroll 2001 * * This file is part of the SCTP kernel implementation. * * A collection class to handle the storage of transport addresses. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> */ #include <linux/types.h> #include <linux/slab.h> #include <linux/in.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/if_inet6.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> /* Forward declarations for internal helpers. */ static int sctp_copy_one_addr(struct net *net, struct sctp_bind_addr *dest, union sctp_addr *addr, enum sctp_scope scope, gfp_t gfp, int flags); static void sctp_bind_addr_clean(struct sctp_bind_addr *); /* First Level Abstractions. */ /* Copy 'src' to 'dest' taking 'scope' into account. Omit addresses * in 'src' which have a broader scope than 'scope'. */ int sctp_bind_addr_copy(struct net *net, struct sctp_bind_addr *dest, const struct sctp_bind_addr *src, enum sctp_scope scope, gfp_t gfp, int flags) { struct sctp_sockaddr_entry *addr; int error = 0; /* All addresses share the same port. */ dest->port = src->port; /* Extract the addresses which are relevant for this scope. */ list_for_each_entry(addr, &src->address_list, list) { error = sctp_copy_one_addr(net, dest, &addr->a, scope, gfp, flags); if (error < 0) goto out; } /* If there are no addresses matching the scope and * this is global scope, try to get a link scope address, with * the assumption that we must be sitting behind a NAT. */ if (list_empty(&dest->address_list) && (SCTP_SCOPE_GLOBAL == scope)) { list_for_each_entry(addr, &src->address_list, list) { error = sctp_copy_one_addr(net, dest, &addr->a, SCTP_SCOPE_LINK, gfp, flags); if (error < 0) goto out; } } /* If somehow no addresses were found that can be used with this * scope, it's an error. */ if (list_empty(&dest->address_list)) error = -ENETUNREACH; out: if (error) sctp_bind_addr_clean(dest); return error; } /* Exactly duplicate the address lists. This is necessary when doing * peer-offs and accepts. We don't want to put all the current system * addresses into the endpoint. That's useless. But we do want duplicat * the list of bound addresses that the older endpoint used. */ int sctp_bind_addr_dup(struct sctp_bind_addr *dest, const struct sctp_bind_addr *src, gfp_t gfp) { struct sctp_sockaddr_entry *addr; int error = 0; /* All addresses share the same port. */ dest->port = src->port; list_for_each_entry(addr, &src->address_list, list) { error = sctp_add_bind_addr(dest, &addr->a, sizeof(addr->a), 1, gfp); if (error < 0) break; } return error; } /* Initialize the SCTP_bind_addr structure for either an endpoint or * an association. */ void sctp_bind_addr_init(struct sctp_bind_addr *bp, __u16 port) { INIT_LIST_HEAD(&bp->address_list); bp->port = port; } /* Dispose of the address list. */ static void sctp_bind_addr_clean(struct sctp_bind_addr *bp) { struct sctp_sockaddr_entry *addr, *temp; /* Empty the bind address list. */ list_for_each_entry_safe(addr, temp, &bp->address_list, list) { list_del_rcu(&addr->list); kfree_rcu(addr, rcu); SCTP_DBG_OBJCNT_DEC(addr); } } /* Dispose of an SCTP_bind_addr structure */ void sctp_bind_addr_free(struct sctp_bind_addr *bp) { /* Empty the bind address list. */ sctp_bind_addr_clean(bp); } /* Add an address to the bind address list in the SCTP_bind_addr structure. */ int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new, int new_size, __u8 addr_state, gfp_t gfp) { struct sctp_sockaddr_entry *addr; /* Add the address to the bind address list. */ addr = kzalloc(sizeof(*addr), gfp); if (!addr) return -ENOMEM; memcpy(&addr->a, new, min_t(size_t, sizeof(*new), new_size)); /* Fix up the port if it has not yet been set. * Both v4 and v6 have the port at the same offset. */ if (!addr->a.v4.sin_port) addr->a.v4.sin_port = htons(bp->port); addr->state = addr_state; addr->valid = 1; INIT_LIST_HEAD(&addr->list); /* We always hold a socket lock when calling this function, * and that acts as a writer synchronizing lock. */ list_add_tail_rcu(&addr->list, &bp->address_list); SCTP_DBG_OBJCNT_INC(addr); return 0; } /* Delete an address from the bind address list in the SCTP_bind_addr * structure. */ int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr) { struct sctp_sockaddr_entry *addr, *temp; int found = 0; /* We hold the socket lock when calling this function, * and that acts as a writer synchronizing lock. */ list_for_each_entry_safe(addr, temp, &bp->address_list, list) { if (sctp_cmp_addr_exact(&addr->a, del_addr)) { /* Found the exact match. */ found = 1; addr->valid = 0; list_del_rcu(&addr->list); break; } } if (found) { kfree_rcu(addr, rcu); SCTP_DBG_OBJCNT_DEC(addr); return 0; } return -EINVAL; } /* Create a network byte-order representation of all the addresses * formated as SCTP parameters. * * The second argument is the return value for the length. */ union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp, int *addrs_len, gfp_t gfp) { union sctp_params addrparms; union sctp_params retval; int addrparms_len; union sctp_addr_param rawaddr; int len; struct sctp_sockaddr_entry *addr; struct list_head *pos; struct sctp_af *af; addrparms_len = 0; len = 0; /* Allocate enough memory at once. */ list_for_each(pos, &bp->address_list) { len += sizeof(union sctp_addr_param); } /* Don't even bother embedding an address if there * is only one. */ if (len == sizeof(union sctp_addr_param)) { retval.v = NULL; goto end_raw; } retval.v = kmalloc(len, gfp); if (!retval.v) goto end_raw; addrparms = retval; list_for_each_entry(addr, &bp->address_list, list) { af = sctp_get_af_specific(addr->a.v4.sin_family); len = af->to_addr_param(&addr->a, &rawaddr); memcpy(addrparms.v, &rawaddr, len); addrparms.v += len; addrparms_len += len; } end_raw: *addrs_len = addrparms_len; return retval; } /* * Create an address list out of the raw address list format (IPv4 and IPv6 * address parameters). */ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list, int addrs_len, __u16 port, gfp_t gfp) { union sctp_addr_param *rawaddr; struct sctp_paramhdr *param; union sctp_addr addr; int retval = 0; int len; struct sctp_af *af; /* Convert the raw address to standard address format */ while (addrs_len) { param = (struct sctp_paramhdr *)raw_addr_list; rawaddr = (union sctp_addr_param *)raw_addr_list; af = sctp_get_af_specific(param_type2af(param->type)); if (unlikely(!af) || !af->from_addr_param(&addr, rawaddr, htons(port), 0)) { retval = -EINVAL; goto out_err; } if (sctp_bind_addr_state(bp, &addr) != -1) goto next; retval = sctp_add_bind_addr(bp, &addr, sizeof(addr), SCTP_ADDR_SRC, gfp); if (retval) /* Can't finish building the list, clean up. */ goto out_err; next: len = ntohs(param->length); addrs_len -= len; raw_addr_list += len; } return retval; out_err: if (retval) sctp_bind_addr_clean(bp); return retval; } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* Does this contain a specified address? Allow wildcarding. */ int sctp_bind_addr_match(struct sctp_bind_addr *bp, const union sctp_addr *addr, struct sctp_sock *opt) { struct sctp_sockaddr_entry *laddr; int match = 0; rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid) continue; if (opt->pf->cmp_addr(&laddr->a, addr, opt)) { match = 1; break; } } rcu_read_unlock(); return match; } int sctp_bind_addrs_check(struct sctp_sock *sp, struct sctp_sock *sp2, int cnt2) { struct sctp_bind_addr *bp2 = &sp2->ep->base.bind_addr; struct sctp_bind_addr *bp = &sp->ep->base.bind_addr; struct sctp_sockaddr_entry *laddr, *laddr2; bool exist = false; int cnt = 0; rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { list_for_each_entry_rcu(laddr2, &bp2->address_list, list) { if (sp->pf->af->cmp_addr(&laddr->a, &laddr2->a) && laddr->valid && laddr2->valid) { exist = true; goto next; } } cnt = 0; break; next: cnt++; } rcu_read_unlock(); return (cnt == cnt2) ? 0 : (exist ? -EEXIST : 1); } /* Does the address 'addr' conflict with any addresses in * the bp. */ int sctp_bind_addr_conflict(struct sctp_bind_addr *bp, const union sctp_addr *addr, struct sctp_sock *bp_sp, struct sctp_sock *addr_sp) { struct sctp_sockaddr_entry *laddr; int conflict = 0; struct sctp_sock *sp; /* Pick the IPv6 socket as the basis of comparison * since it's usually a superset of the IPv4. * If there is no IPv6 socket, then default to bind_addr. */ if (sctp_opt2sk(bp_sp)->sk_family == AF_INET6) sp = bp_sp; else if (sctp_opt2sk(addr_sp)->sk_family == AF_INET6) sp = addr_sp; else sp = bp_sp; rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid) continue; conflict = sp->pf->cmp_addr(&laddr->a, addr, sp); if (conflict) break; } rcu_read_unlock(); return conflict; } /* Get the state of the entry in the bind_addr_list */ int sctp_bind_addr_state(const struct sctp_bind_addr *bp, const union sctp_addr *addr) { struct sctp_sockaddr_entry *laddr; struct sctp_af *af; af = sctp_get_af_specific(addr->sa.sa_family); if (unlikely(!af)) return -1; list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid) continue; if (af->cmp_addr(&laddr->a, addr)) return laddr->state; } return -1; } /* Find the first address in the bind address list that is not present in * the addrs packed array. */ union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp, const union sctp_addr *addrs, int addrcnt, struct sctp_sock *opt) { struct sctp_sockaddr_entry *laddr; union sctp_addr *addr; void *addr_buf; struct sctp_af *af; int i; /* This is only called sctp_send_asconf_del_ip() and we hold * the socket lock in that code patch, so that address list * can't change. */ list_for_each_entry(laddr, &bp->address_list, list) { addr_buf = (union sctp_addr *)addrs; for (i = 0; i < addrcnt; i++) { addr = addr_buf; af = sctp_get_af_specific(addr->v4.sin_family); if (!af) break; if (opt->pf->cmp_addr(&laddr->a, addr, opt)) break; addr_buf += af->sockaddr_len; } if (i == addrcnt) return &laddr->a; } return NULL; } /* Copy out addresses from the global local address list. */ static int sctp_copy_one_addr(struct net *net, struct sctp_bind_addr *dest, union sctp_addr *addr, enum sctp_scope scope, gfp_t gfp, int flags) { int error = 0; if (sctp_is_any(NULL, addr)) { error = sctp_copy_local_addr_list(net, dest, scope, gfp, flags); } else if (sctp_in_scope(net, addr, scope)) { /* Now that the address is in scope, check to see if * the address type is supported by local sock as * well as the remote peer. */ if ((((AF_INET == addr->sa.sa_family) && (flags & SCTP_ADDR4_ALLOWED) && (flags & SCTP_ADDR4_PEERSUPP))) || (((AF_INET6 == addr->sa.sa_family) && (flags & SCTP_ADDR6_ALLOWED) && (flags & SCTP_ADDR6_PEERSUPP)))) error = sctp_add_bind_addr(dest, addr, sizeof(*addr), SCTP_ADDR_SRC, gfp); } return error; } /* Is this a wildcard address? */ int sctp_is_any(struct sock *sk, const union sctp_addr *addr) { unsigned short fam = 0; struct sctp_af *af; /* Try to get the right address family */ if (addr->sa.sa_family != AF_UNSPEC) fam = addr->sa.sa_family; else if (sk) fam = sk->sk_family; af = sctp_get_af_specific(fam); if (!af) return 0; return af->is_any(addr); } /* Is 'addr' valid for 'scope'? */ int sctp_in_scope(struct net *net, const union sctp_addr *addr, enum sctp_scope scope) { enum sctp_scope addr_scope = sctp_scope(addr); /* The unusable SCTP addresses will not be considered with * any defined scopes. */ if (SCTP_SCOPE_UNUSABLE == addr_scope) return 0; /* * For INIT and INIT-ACK address list, let L be the level of * requested destination address, sender and receiver * SHOULD include all of its addresses with level greater * than or equal to L. * * Address scoping can be selectively controlled via sysctl * option */ switch (net->sctp.scope_policy) { case SCTP_SCOPE_POLICY_DISABLE: return 1; case SCTP_SCOPE_POLICY_ENABLE: if (addr_scope <= scope) return 1; break; case SCTP_SCOPE_POLICY_PRIVATE: if (addr_scope <= scope || SCTP_SCOPE_PRIVATE == addr_scope) return 1; break; case SCTP_SCOPE_POLICY_LINK: if (addr_scope <= scope || SCTP_SCOPE_LINK == addr_scope) return 1; break; default: break; } return 0; } int sctp_is_ep_boundall(struct sock *sk) { struct sctp_bind_addr *bp; struct sctp_sockaddr_entry *addr; bp = &sctp_sk(sk)->ep->base.bind_addr; if (sctp_list_single_entry(&bp->address_list)) { addr = list_entry(bp->address_list.next, struct sctp_sockaddr_entry, list); if (sctp_is_any(sk, &addr->a)) return 1; } return 0; } /******************************************************************** * 3rd Level Abstractions ********************************************************************/ /* What is the scope of 'addr'? */ enum sctp_scope sctp_scope(const union sctp_addr *addr) { struct sctp_af *af; af = sctp_get_af_specific(addr->sa.sa_family); if (!af) return SCTP_SCOPE_UNUSABLE; return af->scope((union sctp_addr *)addr); }
4 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM f2fs #if !defined(_TRACE_F2FS_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_F2FS_H #include <linux/tracepoint.h> #include <uapi/linux/f2fs.h> #define show_dev(dev) MAJOR(dev), MINOR(dev) #define show_dev_ino(entry) show_dev(entry->dev), (unsigned long)entry->ino TRACE_DEFINE_ENUM(NODE); TRACE_DEFINE_ENUM(DATA); TRACE_DEFINE_ENUM(META); TRACE_DEFINE_ENUM(META_FLUSH); TRACE_DEFINE_ENUM(IPU); TRACE_DEFINE_ENUM(OPU); TRACE_DEFINE_ENUM(HOT); TRACE_DEFINE_ENUM(WARM); TRACE_DEFINE_ENUM(COLD); TRACE_DEFINE_ENUM(CURSEG_HOT_DATA); TRACE_DEFINE_ENUM(CURSEG_WARM_DATA); TRACE_DEFINE_ENUM(CURSEG_COLD_DATA); TRACE_DEFINE_ENUM(CURSEG_HOT_NODE); TRACE_DEFINE_ENUM(CURSEG_WARM_NODE); TRACE_DEFINE_ENUM(CURSEG_COLD_NODE); TRACE_DEFINE_ENUM(NO_CHECK_TYPE); TRACE_DEFINE_ENUM(GC_GREEDY); TRACE_DEFINE_ENUM(GC_CB); TRACE_DEFINE_ENUM(FG_GC); TRACE_DEFINE_ENUM(BG_GC); TRACE_DEFINE_ENUM(LFS); TRACE_DEFINE_ENUM(SSR); TRACE_DEFINE_ENUM(__REQ_RAHEAD); TRACE_DEFINE_ENUM(__REQ_SYNC); TRACE_DEFINE_ENUM(__REQ_IDLE); TRACE_DEFINE_ENUM(__REQ_PREFLUSH); TRACE_DEFINE_ENUM(__REQ_FUA); TRACE_DEFINE_ENUM(__REQ_PRIO); TRACE_DEFINE_ENUM(__REQ_META); TRACE_DEFINE_ENUM(CP_UMOUNT); TRACE_DEFINE_ENUM(CP_FASTBOOT); TRACE_DEFINE_ENUM(CP_SYNC); TRACE_DEFINE_ENUM(CP_RECOVERY); TRACE_DEFINE_ENUM(CP_DISCARD); TRACE_DEFINE_ENUM(CP_TRIMMED); TRACE_DEFINE_ENUM(CP_PAUSE); TRACE_DEFINE_ENUM(CP_RESIZE); TRACE_DEFINE_ENUM(EX_READ); TRACE_DEFINE_ENUM(EX_BLOCK_AGE); #define show_block_type(type) \ __print_symbolic(type, \ { NODE, "NODE" }, \ { DATA, "DATA" }, \ { META, "META" }, \ { META_FLUSH, "META_FLUSH" }, \ { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) #define show_block_temp(temp) \ __print_symbolic(temp, \ { HOT, "HOT" }, \ { WARM, "WARM" }, \ { COLD, "COLD" }) #define F2FS_OP_FLAGS (REQ_RAHEAD | REQ_SYNC | REQ_META | REQ_PRIO | \ REQ_PREFLUSH | REQ_FUA) #define F2FS_BIO_FLAG_MASK(t) (__force u32)((t) & F2FS_OP_FLAGS) #define show_bio_type(op,op_flags) show_bio_op(op), \ show_bio_op_flags(op_flags) #define show_bio_op(op) blk_op_str(op) #define show_bio_op_flags(flags) \ __print_flags(F2FS_BIO_FLAG_MASK(flags), "|", \ { (__force u32)REQ_RAHEAD, "R" }, \ { (__force u32)REQ_SYNC, "S" }, \ { (__force u32)REQ_META, "M" }, \ { (__force u32)REQ_PRIO, "P" }, \ { (__force u32)REQ_PREFLUSH, "PF" }, \ { (__force u32)REQ_FUA, "FUA" }) #define show_data_type(type) \ __print_symbolic(type, \ { CURSEG_HOT_DATA, "Hot DATA" }, \ { CURSEG_WARM_DATA, "Warm DATA" }, \ { CURSEG_COLD_DATA, "Cold DATA" }, \ { CURSEG_HOT_NODE, "Hot NODE" }, \ { CURSEG_WARM_NODE, "Warm NODE" }, \ { CURSEG_COLD_NODE, "Cold NODE" }, \ { NO_CHECK_TYPE, "No TYPE" }) #define show_file_type(type) \ __print_symbolic(type, \ { 0, "FILE" }, \ { 1, "DIR" }) #define show_gc_type(type) \ __print_symbolic(type, \ { FG_GC, "Foreground GC" }, \ { BG_GC, "Background GC" }) #define show_alloc_mode(type) \ __print_symbolic(type, \ { LFS, "LFS-mode" }, \ { SSR, "SSR-mode" }, \ { AT_SSR, "AT_SSR-mode" }) #define show_victim_policy(type) \ __print_symbolic(type, \ { GC_GREEDY, "Greedy" }, \ { GC_CB, "Cost-Benefit" }, \ { GC_AT, "Age-threshold" }) #define show_cpreason(type) \ __print_flags(type, "|", \ { CP_UMOUNT, "Umount" }, \ { CP_FASTBOOT, "Fastboot" }, \ { CP_SYNC, "Sync" }, \ { CP_RECOVERY, "Recovery" }, \ { CP_DISCARD, "Discard" }, \ { CP_PAUSE, "Pause" }, \ { CP_TRIMMED, "Trimmed" }, \ { CP_RESIZE, "Resize" }) #define show_fsync_cpreason(type) \ __print_symbolic(type, \ { CP_NO_NEEDED, "no needed" }, \ { CP_NON_REGULAR, "non regular" }, \ { CP_COMPRESSED, "compressed" }, \ { CP_HARDLINK, "hardlink" }, \ { CP_SB_NEED_CP, "sb needs cp" }, \ { CP_WRONG_PINO, "wrong pino" }, \ { CP_NO_SPC_ROLL, "no space roll forward" }, \ { CP_NODE_NEED_CP, "node needs cp" }, \ { CP_FASTBOOT_MODE, "fastboot mode" }, \ { CP_SPEC_LOG_NUM, "log type is 2" }, \ { CP_RECOVER_DIR, "dir needs recovery" }) #define show_shutdown_mode(type) \ __print_symbolic(type, \ { F2FS_GOING_DOWN_FULLSYNC, "full sync" }, \ { F2FS_GOING_DOWN_METASYNC, "meta sync" }, \ { F2FS_GOING_DOWN_NOSYNC, "no sync" }, \ { F2FS_GOING_DOWN_METAFLUSH, "meta flush" }, \ { F2FS_GOING_DOWN_NEED_FSCK, "need fsck" }) #define show_compress_algorithm(type) \ __print_symbolic(type, \ { COMPRESS_LZO, "LZO" }, \ { COMPRESS_LZ4, "LZ4" }, \ { COMPRESS_ZSTD, "ZSTD" }, \ { COMPRESS_LZORLE, "LZO-RLE" }) #define show_extent_type(type) \ __print_symbolic(type, \ { EX_READ, "Read" }, \ { EX_BLOCK_AGE, "Block Age" }) struct f2fs_sb_info; struct f2fs_io_info; struct extent_info; struct victim_sel_policy; struct f2fs_map_blocks; DECLARE_EVENT_CLASS(f2fs__inode, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(ino_t, pino) __field(umode_t, mode) __field(loff_t, size) __field(unsigned int, nlink) __field(blkcnt_t, blocks) __field(__u8, advise) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pino = F2FS_I(inode)->i_pino; __entry->mode = inode->i_mode; __entry->nlink = inode->i_nlink; __entry->size = inode->i_size; __entry->blocks = inode->i_blocks; __entry->advise = F2FS_I(inode)->i_advise; ), TP_printk("dev = (%d,%d), ino = %lu, pino = %lu, i_mode = 0x%hx, " "i_size = %lld, i_nlink = %u, i_blocks = %llu, i_advise = 0x%x", show_dev_ino(__entry), (unsigned long)__entry->pino, __entry->mode, __entry->size, (unsigned int)__entry->nlink, (unsigned long long)__entry->blocks, (unsigned char)__entry->advise) ); DECLARE_EVENT_CLASS(f2fs__inode_exit, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(int, ret) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, ret = %d", show_dev_ino(__entry), __entry->ret) ); DEFINE_EVENT(f2fs__inode, f2fs_sync_file_enter, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); TRACE_EVENT(f2fs_sync_file_exit, TP_PROTO(struct inode *inode, int cp_reason, int datasync, int ret), TP_ARGS(inode, cp_reason, datasync, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(int, cp_reason) __field(int, datasync) __field(int, ret) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->cp_reason = cp_reason; __entry->datasync = datasync; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, cp_reason: %s, " "datasync = %d, ret = %d", show_dev_ino(__entry), show_fsync_cpreason(__entry->cp_reason), __entry->datasync, __entry->ret) ); TRACE_EVENT(f2fs_sync_fs, TP_PROTO(struct super_block *sb, int wait), TP_ARGS(sb, wait), TP_STRUCT__entry( __field(dev_t, dev) __field(int, dirty) __field(int, wait) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->dirty = is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY); __entry->wait = wait; ), TP_printk("dev = (%d,%d), superblock is %s, wait = %d", show_dev(__entry->dev), __entry->dirty ? "dirty" : "not dirty", __entry->wait) ); DEFINE_EVENT(f2fs__inode, f2fs_iget, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(f2fs__inode_exit, f2fs_iget_exit, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret) ); DEFINE_EVENT(f2fs__inode, f2fs_evict_inode, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(f2fs__inode_exit, f2fs_new_inode, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret) ); TRACE_EVENT(f2fs_unlink_enter, TP_PROTO(struct inode *dir, struct dentry *dentry), TP_ARGS(dir, dentry), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, size) __field(blkcnt_t, blocks) __string(name, dentry->d_name.name) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->ino = dir->i_ino; __entry->size = dir->i_size; __entry->blocks = dir->i_blocks; __assign_str(name, dentry->d_name.name); ), TP_printk("dev = (%d,%d), dir ino = %lu, i_size = %lld, " "i_blocks = %llu, name = %s", show_dev_ino(__entry), __entry->size, (unsigned long long)__entry->blocks, __get_str(name)) ); DEFINE_EVENT(f2fs__inode_exit, f2fs_unlink_exit, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret) ); DEFINE_EVENT(f2fs__inode_exit, f2fs_drop_inode, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret) ); DEFINE_EVENT(f2fs__inode, f2fs_truncate, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); TRACE_EVENT(f2fs_truncate_data_blocks_range, TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs, int free), TP_ARGS(inode, nid, ofs, free), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(nid_t, nid) __field(unsigned int, ofs) __field(int, free) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nid = nid; __entry->ofs = ofs; __entry->free = free; ), TP_printk("dev = (%d,%d), ino = %lu, nid = %u, offset = %u, freed = %d", show_dev_ino(__entry), (unsigned int)__entry->nid, __entry->ofs, __entry->free) ); DECLARE_EVENT_CLASS(f2fs__truncate_op, TP_PROTO(struct inode *inode, u64 from), TP_ARGS(inode, from), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, size) __field(blkcnt_t, blocks) __field(u64, from) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->size = inode->i_size; __entry->blocks = inode->i_blocks; __entry->from = from; ), TP_printk("dev = (%d,%d), ino = %lu, i_size = %lld, i_blocks = %llu, " "start file offset = %llu", show_dev_ino(__entry), __entry->size, (unsigned long long)__entry->blocks, (unsigned long long)__entry->from) ); DEFINE_EVENT(f2fs__truncate_op, f2fs_truncate_blocks_enter, TP_PROTO(struct inode *inode, u64 from), TP_ARGS(inode, from) ); DEFINE_EVENT(f2fs__inode_exit, f2fs_truncate_blocks_exit, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret) ); DEFINE_EVENT(f2fs__truncate_op, f2fs_truncate_inode_blocks_enter, TP_PROTO(struct inode *inode, u64 from), TP_ARGS(inode, from) ); DEFINE_EVENT(f2fs__inode_exit, f2fs_truncate_inode_blocks_exit, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret) ); DECLARE_EVENT_CLASS(f2fs__truncate_node, TP_PROTO(struct inode *inode, nid_t nid, block_t blk_addr), TP_ARGS(inode, nid, blk_addr), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(nid_t, nid) __field(block_t, blk_addr) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nid = nid; __entry->blk_addr = blk_addr; ), TP_printk("dev = (%d,%d), ino = %lu, nid = %u, block_address = 0x%llx", show_dev_ino(__entry), (unsigned int)__entry->nid, (unsigned long long)__entry->blk_addr) ); DEFINE_EVENT(f2fs__truncate_node, f2fs_truncate_nodes_enter, TP_PROTO(struct inode *inode, nid_t nid, block_t blk_addr), TP_ARGS(inode, nid, blk_addr) ); DEFINE_EVENT(f2fs__inode_exit, f2fs_truncate_nodes_exit, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret) ); DEFINE_EVENT(f2fs__truncate_node, f2fs_truncate_node, TP_PROTO(struct inode *inode, nid_t nid, block_t blk_addr), TP_ARGS(inode, nid, blk_addr) ); TRACE_EVENT(f2fs_truncate_partial_nodes, TP_PROTO(struct inode *inode, nid_t *nid, int depth, int err), TP_ARGS(inode, nid, depth, err), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __array(nid_t, nid, 3) __field(int, depth) __field(int, err) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nid[0] = nid[0]; __entry->nid[1] = nid[1]; __entry->nid[2] = nid[2]; __entry->depth = depth; __entry->err = err; ), TP_printk("dev = (%d,%d), ino = %lu, " "nid[0] = %u, nid[1] = %u, nid[2] = %u, depth = %d, err = %d", show_dev_ino(__entry), (unsigned int)__entry->nid[0], (unsigned int)__entry->nid[1], (unsigned int)__entry->nid[2], __entry->depth, __entry->err) ); TRACE_EVENT(f2fs_file_write_iter, TP_PROTO(struct inode *inode, loff_t offset, size_t length, ssize_t ret), TP_ARGS(inode, offset, length, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, offset) __field(size_t, length) __field(ssize_t, ret) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->offset = offset; __entry->length = length; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, " "offset = %lld, length = %zu, written(err) = %zd", show_dev_ino(__entry), __entry->offset, __entry->length, __entry->ret) ); TRACE_EVENT(f2fs_map_blocks, TP_PROTO(struct inode *inode, struct f2fs_map_blocks *map, int flag, int ret), TP_ARGS(inode, map, flag, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(block_t, m_lblk) __field(block_t, m_pblk) __field(unsigned int, m_len) __field(unsigned int, m_flags) __field(int, m_seg_type) __field(bool, m_may_create) __field(bool, m_multidev_dio) __field(int, flag) __field(int, ret) ), TP_fast_assign( __entry->dev = map->m_bdev->bd_dev; __entry->ino = inode->i_ino; __entry->m_lblk = map->m_lblk; __entry->m_pblk = map->m_pblk; __entry->m_len = map->m_len; __entry->m_flags = map->m_flags; __entry->m_seg_type = map->m_seg_type; __entry->m_may_create = map->m_may_create; __entry->m_multidev_dio = map->m_multidev_dio; __entry->flag = flag; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, file offset = %llu, " "start blkaddr = 0x%llx, len = 0x%llx, flags = %u, " "seg_type = %d, may_create = %d, multidevice = %d, " "flag = %d, err = %d", show_dev_ino(__entry), (unsigned long long)__entry->m_lblk, (unsigned long long)__entry->m_pblk, (unsigned long long)__entry->m_len, __entry->m_flags, __entry->m_seg_type, __entry->m_may_create, __entry->m_multidev_dio, __entry->flag, __entry->ret) ); TRACE_EVENT(f2fs_background_gc, TP_PROTO(struct super_block *sb, unsigned int wait_ms, unsigned int prefree, unsigned int free), TP_ARGS(sb, wait_ms, prefree, free), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, wait_ms) __field(unsigned int, prefree) __field(unsigned int, free) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->wait_ms = wait_ms; __entry->prefree = prefree; __entry->free = free; ), TP_printk("dev = (%d,%d), wait_ms = %u, prefree = %u, free = %u", show_dev(__entry->dev), __entry->wait_ms, __entry->prefree, __entry->free) ); TRACE_EVENT(f2fs_gc_begin, TP_PROTO(struct super_block *sb, int gc_type, bool no_bg_gc, unsigned int nr_free_secs, long long dirty_nodes, long long dirty_dents, long long dirty_imeta, unsigned int free_sec, unsigned int free_seg, int reserved_seg, unsigned int prefree_seg), TP_ARGS(sb, gc_type, no_bg_gc, nr_free_secs, dirty_nodes, dirty_dents, dirty_imeta, free_sec, free_seg, reserved_seg, prefree_seg), TP_STRUCT__entry( __field(dev_t, dev) __field(int, gc_type) __field(bool, no_bg_gc) __field(unsigned int, nr_free_secs) __field(long long, dirty_nodes) __field(long long, dirty_dents) __field(long long, dirty_imeta) __field(unsigned int, free_sec) __field(unsigned int, free_seg) __field(int, reserved_seg) __field(unsigned int, prefree_seg) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->gc_type = gc_type; __entry->no_bg_gc = no_bg_gc; __entry->nr_free_secs = nr_free_secs; __entry->dirty_nodes = dirty_nodes; __entry->dirty_dents = dirty_dents; __entry->dirty_imeta = dirty_imeta; __entry->free_sec = free_sec; __entry->free_seg = free_seg; __entry->reserved_seg = reserved_seg; __entry->prefree_seg = prefree_seg; ), TP_printk("dev = (%d,%d), gc_type = %s, no_background_GC = %d, nr_free_secs = %u, " "nodes = %lld, dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " "rsv_seg:%d, prefree_seg:%u", show_dev(__entry->dev), show_gc_type(__entry->gc_type), (__entry->gc_type == BG_GC) ? __entry->no_bg_gc : -1, __entry->nr_free_secs, __entry->dirty_nodes, __entry->dirty_dents, __entry->dirty_imeta, __entry->free_sec, __entry->free_seg, __entry->reserved_seg, __entry->prefree_seg) ); TRACE_EVENT(f2fs_gc_end, TP_PROTO(struct super_block *sb, int ret, int seg_freed, int sec_freed, long long dirty_nodes, long long dirty_dents, long long dirty_imeta, unsigned int free_sec, unsigned int free_seg, int reserved_seg, unsigned int prefree_seg), TP_ARGS(sb, ret, seg_freed, sec_freed, dirty_nodes, dirty_dents, dirty_imeta, free_sec, free_seg, reserved_seg, prefree_seg), TP_STRUCT__entry( __field(dev_t, dev) __field(int, ret) __field(int, seg_freed) __field(int, sec_freed) __field(long long, dirty_nodes) __field(long long, dirty_dents) __field(long long, dirty_imeta) __field(unsigned int, free_sec) __field(unsigned int, free_seg) __field(int, reserved_seg) __field(unsigned int, prefree_seg) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ret = ret; __entry->seg_freed = seg_freed; __entry->sec_freed = sec_freed; __entry->dirty_nodes = dirty_nodes; __entry->dirty_dents = dirty_dents; __entry->dirty_imeta = dirty_imeta; __entry->free_sec = free_sec; __entry->free_seg = free_seg; __entry->reserved_seg = reserved_seg; __entry->prefree_seg = prefree_seg; ), TP_printk("dev = (%d,%d), ret = %d, seg_freed = %d, sec_freed = %d, " "nodes = %lld, dents = %lld, imeta = %lld, free_sec:%u, " "free_seg:%u, rsv_seg:%d, prefree_seg:%u", show_dev(__entry->dev), __entry->ret, __entry->seg_freed, __entry->sec_freed, __entry->dirty_nodes, __entry->dirty_dents, __entry->dirty_imeta, __entry->free_sec, __entry->free_seg, __entry->reserved_seg, __entry->prefree_seg) ); TRACE_EVENT(f2fs_get_victim, TP_PROTO(struct super_block *sb, int type, int gc_type, struct victim_sel_policy *p, unsigned int pre_victim, unsigned int prefree, unsigned int free), TP_ARGS(sb, type, gc_type, p, pre_victim, prefree, free), TP_STRUCT__entry( __field(dev_t, dev) __field(int, type) __field(int, gc_type) __field(int, alloc_mode) __field(int, gc_mode) __field(unsigned int, victim) __field(unsigned int, cost) __field(unsigned int, ofs_unit) __field(unsigned int, pre_victim) __field(unsigned int, prefree) __field(unsigned int, free) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->type = type; __entry->gc_type = gc_type; __entry->alloc_mode = p->alloc_mode; __entry->gc_mode = p->gc_mode; __entry->victim = p->min_segno; __entry->cost = p->min_cost; __entry->ofs_unit = p->ofs_unit; __entry->pre_victim = pre_victim; __entry->prefree = prefree; __entry->free = free; ), TP_printk("dev = (%d,%d), type = %s, policy = (%s, %s, %s), " "victim = %u, cost = %u, ofs_unit = %u, " "pre_victim_secno = %d, prefree = %u, free = %u", show_dev(__entry->dev), show_data_type(__entry->type), show_gc_type(__entry->gc_type), show_alloc_mode(__entry->alloc_mode), show_victim_policy(__entry->gc_mode), __entry->victim, __entry->cost, __entry->ofs_unit, (int)__entry->pre_victim, __entry->prefree, __entry->free) ); TRACE_EVENT(f2fs_lookup_start, TP_PROTO(struct inode *dir, struct dentry *dentry, unsigned int flags), TP_ARGS(dir, dentry, flags), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __string(name, dentry->d_name.name) __field(unsigned int, flags) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->ino = dir->i_ino; __assign_str(name, dentry->d_name.name); __entry->flags = flags; ), TP_printk("dev = (%d,%d), pino = %lu, name:%s, flags:%u", show_dev_ino(__entry), __get_str(name), __entry->flags) ); TRACE_EVENT(f2fs_lookup_end, TP_PROTO(struct inode *dir, struct dentry *dentry, nid_t ino, int err), TP_ARGS(dir, dentry, ino, err), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __string(name, dentry->d_name.name) __field(nid_t, cino) __field(int, err) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->ino = dir->i_ino; __assign_str(name, dentry->d_name.name); __entry->cino = ino; __entry->err = err; ), TP_printk("dev = (%d,%d), pino = %lu, name:%s, ino:%u, err:%d", show_dev_ino(__entry), __get_str(name), __entry->cino, __entry->err) ); TRACE_EVENT(f2fs_readdir, TP_PROTO(struct inode *dir, loff_t start_pos, loff_t end_pos, int err), TP_ARGS(dir, start_pos, end_pos, err), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, start) __field(loff_t, end) __field(int, err) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->ino = dir->i_ino; __entry->start = start_pos; __entry->end = end_pos; __entry->err = err; ), TP_printk("dev = (%d,%d), ino = %lu, start_pos:%llu, end_pos:%llu, err:%d", show_dev_ino(__entry), __entry->start, __entry->end, __entry->err) ); TRACE_EVENT(f2fs_fallocate, TP_PROTO(struct inode *inode, int mode, loff_t offset, loff_t len, int ret), TP_ARGS(inode, mode, offset, len, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(int, mode) __field(loff_t, offset) __field(loff_t, len) __field(loff_t, size) __field(blkcnt_t, blocks) __field(int, ret) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->mode = mode; __entry->offset = offset; __entry->len = len; __entry->size = inode->i_size; __entry->blocks = inode->i_blocks; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, mode = %x, offset = %lld, " "len = %lld, i_size = %lld, i_blocks = %llu, ret = %d", show_dev_ino(__entry), __entry->mode, (unsigned long long)__entry->offset, (unsigned long long)__entry->len, (unsigned long long)__entry->size, (unsigned long long)__entry->blocks, __entry->ret) ); TRACE_EVENT(f2fs_direct_IO_enter, TP_PROTO(struct inode *inode, struct kiocb *iocb, long len, int rw), TP_ARGS(inode, iocb, len, rw), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, ki_pos) __field(int, ki_flags) __field(u16, ki_ioprio) __field(unsigned long, len) __field(int, rw) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->ki_pos = iocb->ki_pos; __entry->ki_flags = iocb->ki_flags; __entry->ki_ioprio = iocb->ki_ioprio; __entry->len = len; __entry->rw = rw; ), TP_printk("dev = (%d,%d), ino = %lu pos = %lld len = %lu ki_flags = %x ki_ioprio = %x rw = %d", show_dev_ino(__entry), __entry->ki_pos, __entry->len, __entry->ki_flags, __entry->ki_ioprio, __entry->rw) ); TRACE_EVENT(f2fs_direct_IO_exit, TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw, int ret), TP_ARGS(inode, offset, len, rw, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, pos) __field(unsigned long, len) __field(int, rw) __field(int, ret) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = offset; __entry->len = len; __entry->rw = rw; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu pos = %lld len = %lu " "rw = %d ret = %d", show_dev_ino(__entry), __entry->pos, __entry->len, __entry->rw, __entry->ret) ); TRACE_EVENT(f2fs_reserve_new_blocks, TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs_in_node, blkcnt_t count), TP_ARGS(inode, nid, ofs_in_node, count), TP_STRUCT__entry( __field(dev_t, dev) __field(nid_t, nid) __field(unsigned int, ofs_in_node) __field(blkcnt_t, count) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = nid; __entry->ofs_in_node = ofs_in_node; __entry->count = count; ), TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u, count = %llu", show_dev(__entry->dev), (unsigned int)__entry->nid, __entry->ofs_in_node, (unsigned long long)__entry->count) ); DECLARE_EVENT_CLASS(f2fs__submit_page_bio, TP_PROTO(struct page *page, struct f2fs_io_info *fio), TP_ARGS(page, fio), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(pgoff_t, index) __field(block_t, old_blkaddr) __field(block_t, new_blkaddr) __field(enum req_op, op) __field(blk_opf_t, op_flags) __field(int, temp) __field(int, type) ), TP_fast_assign( __entry->dev = page_file_mapping(page)->host->i_sb->s_dev; __entry->ino = page_file_mapping(page)->host->i_ino; __entry->index = page->index; __entry->old_blkaddr = fio->old_blkaddr; __entry->new_blkaddr = fio->new_blkaddr; __entry->op = fio->op; __entry->op_flags = fio->op_flags; __entry->temp = fio->temp; __entry->type = fio->type; ), TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, " "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s(%s), type = %s_%s", show_dev_ino(__entry), (unsigned long)__entry->index, (unsigned long long)__entry->old_blkaddr, (unsigned long long)__entry->new_blkaddr, show_bio_type(__entry->op, __entry->op_flags), show_block_temp(__entry->temp), show_block_type(__entry->type)) ); DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_bio, TP_PROTO(struct page *page, struct f2fs_io_info *fio), TP_ARGS(page, fio), TP_CONDITION(page->mapping) ); DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_write, TP_PROTO(struct page *page, struct f2fs_io_info *fio), TP_ARGS(page, fio), TP_CONDITION(page->mapping) ); DECLARE_EVENT_CLASS(f2fs__bio, TP_PROTO(struct super_block *sb, int type, struct bio *bio), TP_ARGS(sb, type, bio), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, target) __field(enum req_op, op) __field(blk_opf_t, op_flags) __field(int, type) __field(sector_t, sector) __field(unsigned int, size) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->target = bio_dev(bio); __entry->op = bio_op(bio); __entry->op_flags = bio->bi_opf; __entry->type = type; __entry->sector = bio->bi_iter.bi_sector; __entry->size = bio->bi_iter.bi_size; ), TP_printk("dev = (%d,%d)/(%d,%d), rw = %s(%s), %s, sector = %lld, size = %u", show_dev(__entry->target), show_dev(__entry->dev), show_bio_type(__entry->op, __entry->op_flags), show_block_type(__entry->type), (unsigned long long)__entry->sector, __entry->size) ); DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_prepare_write_bio, TP_PROTO(struct super_block *sb, int type, struct bio *bio), TP_ARGS(sb, type, bio), TP_CONDITION(bio) ); DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_prepare_read_bio, TP_PROTO(struct super_block *sb, int type, struct bio *bio), TP_ARGS(sb, type, bio), TP_CONDITION(bio) ); DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_read_bio, TP_PROTO(struct super_block *sb, int type, struct bio *bio), TP_ARGS(sb, type, bio), TP_CONDITION(bio) ); DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_write_bio, TP_PROTO(struct super_block *sb, int type, struct bio *bio), TP_ARGS(sb, type, bio), TP_CONDITION(bio) ); TRACE_EVENT(f2fs_write_begin, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len), TP_ARGS(inode, pos, len), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, pos) __field(unsigned int, len) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; ), TP_printk("dev = (%d,%d), ino = %lu, pos = %llu, len = %u", show_dev_ino(__entry), (unsigned long long)__entry->pos, __entry->len) ); TRACE_EVENT(f2fs_write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, pos) __field(unsigned int, len) __field(unsigned int, copied) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; __entry->copied = copied; ), TP_printk("dev = (%d,%d), ino = %lu, pos = %llu, len = %u, copied = %u", show_dev_ino(__entry), (unsigned long long)__entry->pos, __entry->len, __entry->copied) ); DECLARE_EVENT_CLASS(f2fs__page, TP_PROTO(struct page *page, int type), TP_ARGS(page, type), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(int, type) __field(int, dir) __field(pgoff_t, index) __field(int, dirty) __field(int, uptodate) ), TP_fast_assign( __entry->dev = page_file_mapping(page)->host->i_sb->s_dev; __entry->ino = page_file_mapping(page)->host->i_ino; __entry->type = type; __entry->dir = S_ISDIR(page_file_mapping(page)->host->i_mode); __entry->index = page->index; __entry->dirty = PageDirty(page); __entry->uptodate = PageUptodate(page); ), TP_printk("dev = (%d,%d), ino = %lu, %s, %s, index = %lu, " "dirty = %d, uptodate = %d", show_dev_ino(__entry), show_block_type(__entry->type), show_file_type(__entry->dir), (unsigned long)__entry->index, __entry->dirty, __entry->uptodate) ); DEFINE_EVENT(f2fs__page, f2fs_writepage, TP_PROTO(struct page *page, int type), TP_ARGS(page, type) ); DEFINE_EVENT(f2fs__page, f2fs_do_write_data_page, TP_PROTO(struct page *page, int type), TP_ARGS(page, type) ); DEFINE_EVENT(f2fs__page, f2fs_readpage, TP_PROTO(struct page *page, int type), TP_ARGS(page, type) ); DEFINE_EVENT(f2fs__page, f2fs_set_page_dirty, TP_PROTO(struct page *page, int type), TP_ARGS(page, type) ); DEFINE_EVENT(f2fs__page, f2fs_vm_page_mkwrite, TP_PROTO(struct page *page, int type), TP_ARGS(page, type) ); TRACE_EVENT(f2fs_replace_atomic_write_block, TP_PROTO(struct inode *inode, struct inode *cow_inode, pgoff_t index, block_t old_addr, block_t new_addr, bool recovery), TP_ARGS(inode, cow_inode, index, old_addr, new_addr, recovery), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(ino_t, cow_ino) __field(pgoff_t, index) __field(block_t, old_addr) __field(block_t, new_addr) __field(bool, recovery) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->cow_ino = cow_inode->i_ino; __entry->index = index; __entry->old_addr = old_addr; __entry->new_addr = new_addr; __entry->recovery = recovery; ), TP_printk("dev = (%d,%d), ino = %lu, cow_ino = %lu, index = %lu, " "old_addr = 0x%llx, new_addr = 0x%llx, recovery = %d", show_dev_ino(__entry), __entry->cow_ino, (unsigned long)__entry->index, (unsigned long long)__entry->old_addr, (unsigned long long)__entry->new_addr, __entry->recovery) ); TRACE_EVENT(f2fs_filemap_fault, TP_PROTO(struct inode *inode, pgoff_t index, unsigned long ret), TP_ARGS(inode, index, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(pgoff_t, index) __field(unsigned long, ret) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->index = index; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, index = %lu, ret = %lx", show_dev_ino(__entry), (unsigned long)__entry->index, __entry->ret) ); TRACE_EVENT(f2fs_writepages, TP_PROTO(struct inode *inode, struct writeback_control *wbc, int type), TP_ARGS(inode, wbc, type), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(int, type) __field(int, dir) __field(long, nr_to_write) __field(long, pages_skipped) __field(loff_t, range_start) __field(loff_t, range_end) __field(pgoff_t, writeback_index) __field(int, sync_mode) __field(char, for_kupdate) __field(char, for_background) __field(char, tagged_writepages) __field(char, for_reclaim) __field(char, range_cyclic) __field(char, for_sync) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->type = type; __entry->dir = S_ISDIR(inode->i_mode); __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->range_start = wbc->range_start; __entry->range_end = wbc->range_end; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->sync_mode = wbc->sync_mode; __entry->for_kupdate = wbc->for_kupdate; __entry->for_background = wbc->for_background; __entry->tagged_writepages = wbc->tagged_writepages; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; __entry->for_sync = wbc->for_sync; ), TP_printk("dev = (%d,%d), ino = %lu, %s, %s, nr_to_write %ld, " "skipped %ld, start %lld, end %lld, wb_idx %lu, sync_mode %d, " "kupdate %u background %u tagged %u reclaim %u cyclic %u sync %u", show_dev_ino(__entry), show_block_type(__entry->type), show_file_type(__entry->dir), __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, __entry->range_end, (unsigned long)__entry->writeback_index, __entry->sync_mode, __entry->for_kupdate, __entry->for_background, __entry->tagged_writepages, __entry->for_reclaim, __entry->range_cyclic, __entry->for_sync) ); TRACE_EVENT(f2fs_readpages, TP_PROTO(struct inode *inode, pgoff_t start, unsigned int nrpage), TP_ARGS(inode, start, nrpage), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(pgoff_t, start) __field(unsigned int, nrpage) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start = start; __entry->nrpage = nrpage; ), TP_printk("dev = (%d,%d), ino = %lu, start = %lu nrpage = %u", show_dev_ino(__entry), (unsigned long)__entry->start, __entry->nrpage) ); TRACE_EVENT(f2fs_write_checkpoint, TP_PROTO(struct super_block *sb, int reason, const char *msg), TP_ARGS(sb, reason, msg), TP_STRUCT__entry( __field(dev_t, dev) __field(int, reason) __string(dest_msg, msg) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->reason = reason; __assign_str(dest_msg, msg); ), TP_printk("dev = (%d,%d), checkpoint for %s, state = %s", show_dev(__entry->dev), show_cpreason(__entry->reason), __get_str(dest_msg)) ); DECLARE_EVENT_CLASS(f2fs_discard, TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), TP_ARGS(dev, blkstart, blklen), TP_STRUCT__entry( __field(dev_t, dev) __field(block_t, blkstart) __field(block_t, blklen) ), TP_fast_assign( __entry->dev = dev->bd_dev; __entry->blkstart = blkstart; __entry->blklen = blklen; ), TP_printk("dev = (%d,%d), blkstart = 0x%llx, blklen = 0x%llx", show_dev(__entry->dev), (unsigned long long)__entry->blkstart, (unsigned long long)__entry->blklen) ); DEFINE_EVENT(f2fs_discard, f2fs_queue_discard, TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), TP_ARGS(dev, blkstart, blklen) ); DEFINE_EVENT(f2fs_discard, f2fs_issue_discard, TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), TP_ARGS(dev, blkstart, blklen) ); DEFINE_EVENT(f2fs_discard, f2fs_remove_discard, TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), TP_ARGS(dev, blkstart, blklen) ); DECLARE_EVENT_CLASS(f2fs_reset_zone, TP_PROTO(struct block_device *dev, block_t blkstart), TP_ARGS(dev, blkstart), TP_STRUCT__entry( __field(dev_t, dev) __field(block_t, blkstart) ), TP_fast_assign( __entry->dev = dev->bd_dev; __entry->blkstart = blkstart; ), TP_printk("dev = (%d,%d), zone at block = 0x%llx", show_dev(__entry->dev), (unsigned long long)__entry->blkstart) ); DEFINE_EVENT(f2fs_reset_zone, f2fs_queue_reset_zone, TP_PROTO(struct block_device *dev, block_t blkstart), TP_ARGS(dev, blkstart) ); DEFINE_EVENT(f2fs_reset_zone, f2fs_issue_reset_zone, TP_PROTO(struct block_device *dev, block_t blkstart), TP_ARGS(dev, blkstart) ); TRACE_EVENT(f2fs_issue_flush, TP_PROTO(struct block_device *dev, unsigned int nobarrier, unsigned int flush_merge, int ret), TP_ARGS(dev, nobarrier, flush_merge, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, nobarrier) __field(unsigned int, flush_merge) __field(int, ret) ), TP_fast_assign( __entry->dev = dev->bd_dev; __entry->nobarrier = nobarrier; __entry->flush_merge = flush_merge; __entry->ret = ret; ), TP_printk("dev = (%d,%d), %s %s, ret = %d", show_dev(__entry->dev), __entry->nobarrier ? "skip (nobarrier)" : "issue", __entry->flush_merge ? " with flush_merge" : "", __entry->ret) ); TRACE_EVENT(f2fs_lookup_extent_tree_start, TP_PROTO(struct inode *inode, unsigned int pgofs, enum extent_type type), TP_ARGS(inode, pgofs, type), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, pgofs) __field(enum extent_type, type) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pgofs = pgofs; __entry->type = type; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, type = %s", show_dev_ino(__entry), __entry->pgofs, show_extent_type(__entry->type)) ); TRACE_EVENT_CONDITION(f2fs_lookup_read_extent_tree_end, TP_PROTO(struct inode *inode, unsigned int pgofs, struct extent_info *ei), TP_ARGS(inode, pgofs, ei), TP_CONDITION(ei), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, pgofs) __field(unsigned int, fofs) __field(unsigned int, len) __field(u32, blk) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pgofs = pgofs; __entry->fofs = ei->fofs; __entry->len = ei->len; __entry->blk = ei->blk; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " "read_ext_info(fofs: %u, len: %u, blk: %u)", show_dev_ino(__entry), __entry->pgofs, __entry->fofs, __entry->len, __entry->blk) ); TRACE_EVENT_CONDITION(f2fs_lookup_age_extent_tree_end, TP_PROTO(struct inode *inode, unsigned int pgofs, struct extent_info *ei), TP_ARGS(inode, pgofs, ei), TP_CONDITION(ei), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, pgofs) __field(unsigned int, fofs) __field(unsigned int, len) __field(unsigned long long, age) __field(unsigned long long, blocks) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pgofs = pgofs; __entry->fofs = ei->fofs; __entry->len = ei->len; __entry->age = ei->age; __entry->blocks = ei->last_blocks; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " "age_ext_info(fofs: %u, len: %u, age: %llu, blocks: %llu)", show_dev_ino(__entry), __entry->pgofs, __entry->fofs, __entry->len, __entry->age, __entry->blocks) ); TRACE_EVENT(f2fs_update_read_extent_tree_range, TP_PROTO(struct inode *inode, unsigned int pgofs, unsigned int len, block_t blkaddr, unsigned int c_len), TP_ARGS(inode, pgofs, len, blkaddr, c_len), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, pgofs) __field(u32, blk) __field(unsigned int, len) __field(unsigned int, c_len) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pgofs = pgofs; __entry->len = len; __entry->blk = blkaddr; __entry->c_len = c_len; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " "len = %u, blkaddr = %u, c_len = %u", show_dev_ino(__entry), __entry->pgofs, __entry->len, __entry->blk, __entry->c_len) ); TRACE_EVENT(f2fs_update_age_extent_tree_range, TP_PROTO(struct inode *inode, unsigned int pgofs, unsigned int len, unsigned long long age, unsigned long long last_blks), TP_ARGS(inode, pgofs, len, age, last_blks), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, pgofs) __field(unsigned int, len) __field(unsigned long long, age) __field(unsigned long long, blocks) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pgofs = pgofs; __entry->len = len; __entry->age = age; __entry->blocks = last_blks; ), TP_printk("dev = (%d,%d), ino = %lu, pgofs = %u, " "len = %u, age = %llu, blocks = %llu", show_dev_ino(__entry), __entry->pgofs, __entry->len, __entry->age, __entry->blocks) ); TRACE_EVENT(f2fs_shrink_extent_tree, TP_PROTO(struct f2fs_sb_info *sbi, unsigned int node_cnt, unsigned int tree_cnt, enum extent_type type), TP_ARGS(sbi, node_cnt, tree_cnt, type), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, node_cnt) __field(unsigned int, tree_cnt) __field(enum extent_type, type) ), TP_fast_assign( __entry->dev = sbi->sb->s_dev; __entry->node_cnt = node_cnt; __entry->tree_cnt = tree_cnt; __entry->type = type; ), TP_printk("dev = (%d,%d), shrunk: node_cnt = %u, tree_cnt = %u, type = %s", show_dev(__entry->dev), __entry->node_cnt, __entry->tree_cnt, show_extent_type(__entry->type)) ); TRACE_EVENT(f2fs_destroy_extent_tree, TP_PROTO(struct inode *inode, unsigned int node_cnt, enum extent_type type), TP_ARGS(inode, node_cnt, type), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(unsigned int, node_cnt) __field(enum extent_type, type) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->node_cnt = node_cnt; __entry->type = type; ), TP_printk("dev = (%d,%d), ino = %lu, destroyed: node_cnt = %u, type = %s", show_dev_ino(__entry), __entry->node_cnt, show_extent_type(__entry->type)) ); DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, TP_PROTO(struct super_block *sb, int type, s64 count), TP_ARGS(sb, type, count), TP_STRUCT__entry( __field(dev_t, dev) __field(int, type) __field(s64, count) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->type = type; __entry->count = count; ), TP_printk("dev = (%d,%d), %s, dirty count = %lld", show_dev(__entry->dev), show_file_type(__entry->type), __entry->count) ); DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_enter, TP_PROTO(struct super_block *sb, int type, s64 count), TP_ARGS(sb, type, count) ); DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_exit, TP_PROTO(struct super_block *sb, int type, s64 count), TP_ARGS(sb, type, count) ); TRACE_EVENT(f2fs_shutdown, TP_PROTO(struct f2fs_sb_info *sbi, unsigned int mode, int ret), TP_ARGS(sbi, mode, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, mode) __field(int, ret) ), TP_fast_assign( __entry->dev = sbi->sb->s_dev; __entry->mode = mode; __entry->ret = ret; ), TP_printk("dev = (%d,%d), mode: %s, ret:%d", show_dev(__entry->dev), show_shutdown_mode(__entry->mode), __entry->ret) ); DECLARE_EVENT_CLASS(f2fs_zip_start, TP_PROTO(struct inode *inode, pgoff_t cluster_idx, unsigned int cluster_size, unsigned char algtype), TP_ARGS(inode, cluster_idx, cluster_size, algtype), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(pgoff_t, idx) __field(unsigned int, size) __field(unsigned int, algtype) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->idx = cluster_idx; __entry->size = cluster_size; __entry->algtype = algtype; ), TP_printk("dev = (%d,%d), ino = %lu, cluster_idx:%lu, " "cluster_size = %u, algorithm = %s", show_dev_ino(__entry), __entry->idx, __entry->size, show_compress_algorithm(__entry->algtype)) ); DECLARE_EVENT_CLASS(f2fs_zip_end, TP_PROTO(struct inode *inode, pgoff_t cluster_idx, unsigned int compressed_size, int ret), TP_ARGS(inode, cluster_idx, compressed_size, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(pgoff_t, idx) __field(unsigned int, size) __field(unsigned int, ret) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->idx = cluster_idx; __entry->size = compressed_size; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, cluster_idx:%lu, " "compressed_size = %u, ret = %d", show_dev_ino(__entry), __entry->idx, __entry->size, __entry->ret) ); DEFINE_EVENT(f2fs_zip_start, f2fs_compress_pages_start, TP_PROTO(struct inode *inode, pgoff_t cluster_idx, unsigned int cluster_size, unsigned char algtype), TP_ARGS(inode, cluster_idx, cluster_size, algtype) ); DEFINE_EVENT(f2fs_zip_start, f2fs_decompress_pages_start, TP_PROTO(struct inode *inode, pgoff_t cluster_idx, unsigned int cluster_size, unsigned char algtype), TP_ARGS(inode, cluster_idx, cluster_size, algtype) ); DEFINE_EVENT(f2fs_zip_end, f2fs_compress_pages_end, TP_PROTO(struct inode *inode, pgoff_t cluster_idx, unsigned int compressed_size, int ret), TP_ARGS(inode, cluster_idx, compressed_size, ret) ); DEFINE_EVENT(f2fs_zip_end, f2fs_decompress_pages_end, TP_PROTO(struct inode *inode, pgoff_t cluster_idx, unsigned int compressed_size, int ret), TP_ARGS(inode, cluster_idx, compressed_size, ret) ); #ifdef CONFIG_F2FS_IOSTAT TRACE_EVENT(f2fs_iostat, TP_PROTO(struct f2fs_sb_info *sbi, unsigned long long *iostat), TP_ARGS(sbi, iostat), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned long long, app_dio) __field(unsigned long long, app_bio) __field(unsigned long long, app_wio) __field(unsigned long long, app_mio) __field(unsigned long long, app_bcdio) __field(unsigned long long, app_mcdio) __field(unsigned long long, fs_dio) __field(unsigned long long, fs_cdio) __field(unsigned long long, fs_nio) __field(unsigned long long, fs_mio) __field(unsigned long long, fs_gc_dio) __field(unsigned long long, fs_gc_nio) __field(unsigned long long, fs_cp_dio) __field(unsigned long long, fs_cp_nio) __field(unsigned long long, fs_cp_mio) __field(unsigned long long, app_drio) __field(unsigned long long, app_brio) __field(unsigned long long, app_rio) __field(unsigned long long, app_mrio) __field(unsigned long long, app_bcrio) __field(unsigned long long, app_mcrio) __field(unsigned long long, fs_drio) __field(unsigned long long, fs_gdrio) __field(unsigned long long, fs_cdrio) __field(unsigned long long, fs_nrio) __field(unsigned long long, fs_mrio) __field(unsigned long long, fs_discard) __field(unsigned long long, fs_reset_zone) ), TP_fast_assign( __entry->dev = sbi->sb->s_dev; __entry->app_dio = iostat[APP_DIRECT_IO]; __entry->app_bio = iostat[APP_BUFFERED_IO]; __entry->app_wio = iostat[APP_WRITE_IO]; __entry->app_mio = iostat[APP_MAPPED_IO]; __entry->app_bcdio = iostat[APP_BUFFERED_CDATA_IO]; __entry->app_mcdio = iostat[APP_MAPPED_CDATA_IO]; __entry->fs_dio = iostat[FS_DATA_IO]; __entry->fs_cdio = iostat[FS_CDATA_IO]; __entry->fs_nio = iostat[FS_NODE_IO]; __entry->fs_mio = iostat[FS_META_IO]; __entry->fs_gc_dio = iostat[FS_GC_DATA_IO]; __entry->fs_gc_nio = iostat[FS_GC_NODE_IO]; __entry->fs_cp_dio = iostat[FS_CP_DATA_IO]; __entry->fs_cp_nio = iostat[FS_CP_NODE_IO]; __entry->fs_cp_mio = iostat[FS_CP_META_IO]; __entry->app_drio = iostat[APP_DIRECT_READ_IO]; __entry->app_brio = iostat[APP_BUFFERED_READ_IO]; __entry->app_rio = iostat[APP_READ_IO]; __entry->app_mrio = iostat[APP_MAPPED_READ_IO]; __entry->app_bcrio = iostat[APP_BUFFERED_CDATA_READ_IO]; __entry->app_mcrio = iostat[APP_MAPPED_CDATA_READ_IO]; __entry->fs_drio = iostat[FS_DATA_READ_IO]; __entry->fs_gdrio = iostat[FS_GDATA_READ_IO]; __entry->fs_cdrio = iostat[FS_CDATA_READ_IO]; __entry->fs_nrio = iostat[FS_NODE_READ_IO]; __entry->fs_mrio = iostat[FS_META_READ_IO]; __entry->fs_discard = iostat[FS_DISCARD_IO]; __entry->fs_reset_zone = iostat[FS_ZONE_RESET_IO]; ), TP_printk("dev = (%d,%d), " "app [write=%llu (direct=%llu, buffered=%llu), mapped=%llu, " "compr(buffered=%llu, mapped=%llu)], " "fs [data=%llu, cdata=%llu, node=%llu, meta=%llu, discard=%llu, " "reset_zone=%llu], " "gc [data=%llu, node=%llu], " "cp [data=%llu, node=%llu, meta=%llu], " "app [read=%llu (direct=%llu, buffered=%llu), mapped=%llu], " "compr(buffered=%llu, mapped=%llu)], " "fs [data=%llu, (gc_data=%llu, cdata=%llu), " "node=%llu, meta=%llu]", show_dev(__entry->dev), __entry->app_wio, __entry->app_dio, __entry->app_bio, __entry->app_mio, __entry->app_bcdio, __entry->app_mcdio, __entry->fs_dio, __entry->fs_cdio, __entry->fs_nio, __entry->fs_mio, __entry->fs_discard, __entry->fs_reset_zone, __entry->fs_gc_dio, __entry->fs_gc_nio, __entry->fs_cp_dio, __entry->fs_cp_nio, __entry->fs_cp_mio, __entry->app_rio, __entry->app_drio, __entry->app_brio, __entry->app_mrio, __entry->app_bcrio, __entry->app_mcrio, __entry->fs_drio, __entry->fs_gdrio, __entry->fs_cdrio, __entry->fs_nrio, __entry->fs_mrio) ); #ifndef __F2FS_IOSTAT_LATENCY_TYPE #define __F2FS_IOSTAT_LATENCY_TYPE struct f2fs_iostat_latency { unsigned int peak_lat; unsigned int avg_lat; unsigned int cnt; }; #endif /* __F2FS_IOSTAT_LATENCY_TYPE */ TRACE_EVENT(f2fs_iostat_latency, TP_PROTO(struct f2fs_sb_info *sbi, struct f2fs_iostat_latency (*iostat_lat)[NR_PAGE_TYPE]), TP_ARGS(sbi, iostat_lat), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, d_rd_peak) __field(unsigned int, d_rd_avg) __field(unsigned int, d_rd_cnt) __field(unsigned int, n_rd_peak) __field(unsigned int, n_rd_avg) __field(unsigned int, n_rd_cnt) __field(unsigned int, m_rd_peak) __field(unsigned int, m_rd_avg) __field(unsigned int, m_rd_cnt) __field(unsigned int, d_wr_s_peak) __field(unsigned int, d_wr_s_avg) __field(unsigned int, d_wr_s_cnt) __field(unsigned int, n_wr_s_peak) __field(unsigned int, n_wr_s_avg) __field(unsigned int, n_wr_s_cnt) __field(unsigned int, m_wr_s_peak) __field(unsigned int, m_wr_s_avg) __field(unsigned int, m_wr_s_cnt) __field(unsigned int, d_wr_as_peak) __field(unsigned int, d_wr_as_avg) __field(unsigned int, d_wr_as_cnt) __field(unsigned int, n_wr_as_peak) __field(unsigned int, n_wr_as_avg) __field(unsigned int, n_wr_as_cnt) __field(unsigned int, m_wr_as_peak) __field(unsigned int, m_wr_as_avg) __field(unsigned int, m_wr_as_cnt) ), TP_fast_assign( __entry->dev = sbi->sb->s_dev; __entry->d_rd_peak = iostat_lat[READ_IO][DATA].peak_lat; __entry->d_rd_avg = iostat_lat[READ_IO][DATA].avg_lat; __entry->d_rd_cnt = iostat_lat[READ_IO][DATA].cnt; __entry->n_rd_peak = iostat_lat[READ_IO][NODE].peak_lat; __entry->n_rd_avg = iostat_lat[READ_IO][NODE].avg_lat; __entry->n_rd_cnt = iostat_lat[READ_IO][NODE].cnt; __entry->m_rd_peak = iostat_lat[READ_IO][META].peak_lat; __entry->m_rd_avg = iostat_lat[READ_IO][META].avg_lat; __entry->m_rd_cnt = iostat_lat[READ_IO][META].cnt; __entry->d_wr_s_peak = iostat_lat[WRITE_SYNC_IO][DATA].peak_lat; __entry->d_wr_s_avg = iostat_lat[WRITE_SYNC_IO][DATA].avg_lat; __entry->d_wr_s_cnt = iostat_lat[WRITE_SYNC_IO][DATA].cnt; __entry->n_wr_s_peak = iostat_lat[WRITE_SYNC_IO][NODE].peak_lat; __entry->n_wr_s_avg = iostat_lat[WRITE_SYNC_IO][NODE].avg_lat; __entry->n_wr_s_cnt = iostat_lat[WRITE_SYNC_IO][NODE].cnt; __entry->m_wr_s_peak = iostat_lat[WRITE_SYNC_IO][META].peak_lat; __entry->m_wr_s_avg = iostat_lat[WRITE_SYNC_IO][META].avg_lat; __entry->m_wr_s_cnt = iostat_lat[WRITE_SYNC_IO][META].cnt; __entry->d_wr_as_peak = iostat_lat[WRITE_ASYNC_IO][DATA].peak_lat; __entry->d_wr_as_avg = iostat_lat[WRITE_ASYNC_IO][DATA].avg_lat; __entry->d_wr_as_cnt = iostat_lat[WRITE_ASYNC_IO][DATA].cnt; __entry->n_wr_as_peak = iostat_lat[WRITE_ASYNC_IO][NODE].peak_lat; __entry->n_wr_as_avg = iostat_lat[WRITE_ASYNC_IO][NODE].avg_lat; __entry->n_wr_as_cnt = iostat_lat[WRITE_ASYNC_IO][NODE].cnt; __entry->m_wr_as_peak = iostat_lat[WRITE_ASYNC_IO][META].peak_lat; __entry->m_wr_as_avg = iostat_lat[WRITE_ASYNC_IO][META].avg_lat; __entry->m_wr_as_cnt = iostat_lat[WRITE_ASYNC_IO][META].cnt; ), TP_printk("dev = (%d,%d), " "iotype [peak lat.(ms)/avg lat.(ms)/count], " "rd_data [%u/%u/%u], rd_node [%u/%u/%u], rd_meta [%u/%u/%u], " "wr_sync_data [%u/%u/%u], wr_sync_node [%u/%u/%u], " "wr_sync_meta [%u/%u/%u], wr_async_data [%u/%u/%u], " "wr_async_node [%u/%u/%u], wr_async_meta [%u/%u/%u]", show_dev(__entry->dev), __entry->d_rd_peak, __entry->d_rd_avg, __entry->d_rd_cnt, __entry->n_rd_peak, __entry->n_rd_avg, __entry->n_rd_cnt, __entry->m_rd_peak, __entry->m_rd_avg, __entry->m_rd_cnt, __entry->d_wr_s_peak, __entry->d_wr_s_avg, __entry->d_wr_s_cnt, __entry->n_wr_s_peak, __entry->n_wr_s_avg, __entry->n_wr_s_cnt, __entry->m_wr_s_peak, __entry->m_wr_s_avg, __entry->m_wr_s_cnt, __entry->d_wr_as_peak, __entry->d_wr_as_avg, __entry->d_wr_as_cnt, __entry->n_wr_as_peak, __entry->n_wr_as_avg, __entry->n_wr_as_cnt, __entry->m_wr_as_peak, __entry->m_wr_as_avg, __entry->m_wr_as_cnt) ); #endif TRACE_EVENT(f2fs_bmap, TP_PROTO(struct inode *inode, sector_t lblock, sector_t pblock), TP_ARGS(inode, lblock, pblock), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(sector_t, lblock) __field(sector_t, pblock) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblock = lblock; __entry->pblock = pblock; ), TP_printk("dev = (%d,%d), ino = %lu, lblock:%lld, pblock:%lld", show_dev_ino(__entry), (unsigned long long)__entry->lblock, (unsigned long long)__entry->pblock) ); TRACE_EVENT(f2fs_fiemap, TP_PROTO(struct inode *inode, sector_t lblock, sector_t pblock, unsigned long long len, unsigned int flags, int ret), TP_ARGS(inode, lblock, pblock, len, flags, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(sector_t, lblock) __field(sector_t, pblock) __field(unsigned long long, len) __field(unsigned int, flags) __field(int, ret) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblock = lblock; __entry->pblock = pblock; __entry->len = len; __entry->flags = flags; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, lblock:%lld, pblock:%lld, " "len:%llu, flags:%u, ret:%d", show_dev_ino(__entry), (unsigned long long)__entry->lblock, (unsigned long long)__entry->pblock, __entry->len, __entry->flags, __entry->ret) ); DECLARE_EVENT_CLASS(f2fs__rw_start, TP_PROTO(struct inode *inode, loff_t offset, int bytes, pid_t pid, char *pathname, char *command), TP_ARGS(inode, offset, bytes, pid, pathname, command), TP_STRUCT__entry( __string(pathbuf, pathname) __field(loff_t, offset) __field(int, bytes) __field(loff_t, i_size) __string(cmdline, command) __field(pid_t, pid) __field(ino_t, ino) ), TP_fast_assign( /* * Replace the spaces in filenames and cmdlines * because this screws up the tooling that parses * the traces. */ __assign_str(pathbuf, pathname); (void)strreplace(__get_str(pathbuf), ' ', '_'); __entry->offset = offset; __entry->bytes = bytes; __entry->i_size = i_size_read(inode); __assign_str(cmdline, command); (void)strreplace(__get_str(cmdline), ' ', '_'); __entry->pid = pid; __entry->ino = inode->i_ino; ), TP_printk("entry_name %s, offset %llu, bytes %d, cmdline %s," " pid %d, i_size %llu, ino %lu", __get_str(pathbuf), __entry->offset, __entry->bytes, __get_str(cmdline), __entry->pid, __entry->i_size, (unsigned long) __entry->ino) ); DECLARE_EVENT_CLASS(f2fs__rw_end, TP_PROTO(struct inode *inode, loff_t offset, int bytes), TP_ARGS(inode, offset, bytes), TP_STRUCT__entry( __field(ino_t, ino) __field(loff_t, offset) __field(int, bytes) ), TP_fast_assign( __entry->ino = inode->i_ino; __entry->offset = offset; __entry->bytes = bytes; ), TP_printk("ino %lu, offset %llu, bytes %d", (unsigned long) __entry->ino, __entry->offset, __entry->bytes) ); DEFINE_EVENT(f2fs__rw_start, f2fs_dataread_start, TP_PROTO(struct inode *inode, loff_t offset, int bytes, pid_t pid, char *pathname, char *command), TP_ARGS(inode, offset, bytes, pid, pathname, command) ); DEFINE_EVENT(f2fs__rw_end, f2fs_dataread_end, TP_PROTO(struct inode *inode, loff_t offset, int bytes), TP_ARGS(inode, offset, bytes) ); DEFINE_EVENT(f2fs__rw_start, f2fs_datawrite_start, TP_PROTO(struct inode *inode, loff_t offset, int bytes, pid_t pid, char *pathname, char *command), TP_ARGS(inode, offset, bytes, pid, pathname, command) ); DEFINE_EVENT(f2fs__rw_end, f2fs_datawrite_end, TP_PROTO(struct inode *inode, loff_t offset, int bytes), TP_ARGS(inode, offset, bytes) ); #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
214 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TEXT_PATCHING_H #define _ASM_X86_TEXT_PATCHING_H #include <linux/types.h> #include <linux/stddef.h> #include <asm/ptrace.h> struct paravirt_patch_site; #ifdef CONFIG_PARAVIRT void apply_paravirt(struct paravirt_patch_site *start, struct paravirt_patch_site *end); #else static inline void apply_paravirt(struct paravirt_patch_site *start, struct paravirt_patch_site *end) {} #define __parainstructions NULL #define __parainstructions_end NULL #endif /* * Currently, the max observed size in the kernel code is * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5. * Raise it if needed. */ #define POKE_MAX_OPCODE_SIZE 5 extern void text_poke_early(void *addr, const void *opcode, size_t len); /* * Clear and restore the kernel write-protection flag on the local CPU. * Allows the kernel to edit read-only pages. * Side-effect: any interrupt handler running between save and restore will have * the ability to write to read-only pages. * * Warning: * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and * no thread can be preempted in the instructions being modified (no iret to an * invalid instruction possible) or if the instructions are changed from a * consistent state to another consistent state atomically. * On the local CPU you need to be protected against NMI or MCE handlers seeing * an inconsistent instruction while you patch. */ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void text_poke_sync(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern void *text_poke_copy(void *addr, const void *opcode, size_t len); extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok); extern void *text_poke_set(void *addr, int c, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate); extern void text_poke_finish(void); #define INT3_INSN_SIZE 1 #define INT3_INSN_OPCODE 0xCC #define RET_INSN_SIZE 1 #define RET_INSN_OPCODE 0xC3 #define CALL_INSN_SIZE 5 #define CALL_INSN_OPCODE 0xE8 #define JMP32_INSN_SIZE 5 #define JMP32_INSN_OPCODE 0xE9 #define JMP8_INSN_SIZE 2 #define JMP8_INSN_OPCODE 0xEB #define DISP32_SIZE 4 static __always_inline int text_opcode_size(u8 opcode) { int size = 0; #define __CASE(insn) \ case insn##_INSN_OPCODE: size = insn##_INSN_SIZE; break switch(opcode) { __CASE(INT3); __CASE(RET); __CASE(CALL); __CASE(JMP32); __CASE(JMP8); } #undef __CASE return size; } union text_poke_insn { u8 text[POKE_MAX_OPCODE_SIZE]; struct { u8 opcode; s32 disp; } __attribute__((packed)); }; static __always_inline void __text_gen_insn(void *buf, u8 opcode, const void *addr, const void *dest, int size) { union text_poke_insn *insn = buf; BUG_ON(size < text_opcode_size(opcode)); /* * Hide the addresses to avoid the compiler folding in constants when * referencing code, these can mess up annotations like * ANNOTATE_NOENDBR. */ OPTIMIZER_HIDE_VAR(insn); OPTIMIZER_HIDE_VAR(addr); OPTIMIZER_HIDE_VAR(dest); insn->opcode = opcode; if (size > 1) { insn->disp = (long)dest - (long)(addr + size); if (size == 2) { /* * Ensure that for JMP8 the displacement * actually fits the signed byte. */ BUG_ON((insn->disp >> 31) != (insn->disp >> 7)); } } } static __always_inline void *text_gen_insn(u8 opcode, const void *addr, const void *dest) { static union text_poke_insn insn; /* per instance */ __text_gen_insn(&insn, opcode, addr, dest, text_opcode_size(opcode)); return &insn.text; } extern int after_bootmem; extern __ro_after_init struct mm_struct *poking_mm; extern __ro_after_init unsigned long poking_addr; #ifndef CONFIG_UML_X86 static __always_inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) { regs->ip = ip; } static __always_inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) { /* * The int3 handler in entry_64.S adds a gap between the * stack where the break point happened, and the saving of * pt_regs. We can extend the original stack because of * this gap. See the idtentry macro's create_gap option. * * Similarly entry_32.S will have a gap on the stack for (any) hardware * exception and pt_regs; see FIXUP_FRAME. */ regs->sp -= sizeof(unsigned long); *(unsigned long *)regs->sp = val; } static __always_inline unsigned long int3_emulate_pop(struct pt_regs *regs) { unsigned long val = *(unsigned long *)regs->sp; regs->sp += sizeof(unsigned long); return val; } static __always_inline void int3_emulate_call(struct pt_regs *regs, unsigned long func) { int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE); int3_emulate_jmp(regs, func); } static __always_inline void int3_emulate_ret(struct pt_regs *regs) { unsigned long ip = int3_emulate_pop(regs); int3_emulate_jmp(regs, ip); } static __always_inline void int3_emulate_jcc(struct pt_regs *regs, u8 cc, unsigned long ip, unsigned long disp) { static const unsigned long jcc_mask[6] = { [0] = X86_EFLAGS_OF, [1] = X86_EFLAGS_CF, [2] = X86_EFLAGS_ZF, [3] = X86_EFLAGS_CF | X86_EFLAGS_ZF, [4] = X86_EFLAGS_SF, [5] = X86_EFLAGS_PF, }; bool invert = cc & 1; bool match; if (cc < 0xc) { match = regs->flags & jcc_mask[cc >> 1]; } else { match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^ ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT); if (cc >= 0xe) match = match || (regs->flags & X86_EFLAGS_ZF); } if ((match && !invert) || (!match && invert)) ip += disp; int3_emulate_jmp(regs, ip); } #endif /* !CONFIG_UML_X86 */ #endif /* _ASM_X86_TEXT_PATCHING_H */
16 1 14 1 2 4 1 2 5 1 4 1 8 4 2 1 1 8 2 3 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 // SPDX-License-Identifier: GPL-2.0-only /* Kernel module to match L2TP header parameters. */ /* (C) 2013 James Chapman <jchapman@katalix.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/if_ether.h> #include <net/ip.h> #include <linux/ipv6.h> #include <net/ipv6.h> #include <net/udp.h> #include <linux/l2tp.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_tcpudp.h> #include <linux/netfilter/xt_l2tp.h> /* L2TP header masks */ #define L2TP_HDR_T_BIT 0x8000 #define L2TP_HDR_L_BIT 0x4000 #define L2TP_HDR_VER 0x000f MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); MODULE_DESCRIPTION("Xtables: L2TP header match"); MODULE_ALIAS("ipt_l2tp"); MODULE_ALIAS("ip6t_l2tp"); /* The L2TP fields that can be matched */ struct l2tp_data { u32 tid; u32 sid; u8 type; u8 version; }; union l2tp_val { __be16 val16[2]; __be32 val32; }; static bool l2tp_match(const struct xt_l2tp_info *info, struct l2tp_data *data) { if ((info->flags & XT_L2TP_TYPE) && (info->type != data->type)) return false; if ((info->flags & XT_L2TP_VERSION) && (info->version != data->version)) return false; /* Check tid only for L2TPv3 control or any L2TPv2 packets */ if ((info->flags & XT_L2TP_TID) && ((data->type == XT_L2TP_TYPE_CONTROL) || (data->version == 2)) && (info->tid != data->tid)) return false; /* Check sid only for L2TP data packets */ if ((info->flags & XT_L2TP_SID) && (data->type == XT_L2TP_TYPE_DATA) && (info->sid != data->sid)) return false; return true; } /* Parse L2TP header fields when UDP encapsulation is used. Handles * L2TPv2 and L2TPv3. Note the L2TPv3 control and data packets have a * different format. See * RFC2661, Section 3.1, L2TPv2 Header Format * RFC3931, Section 3.2.1, L2TPv3 Control Message Header * RFC3931, Section 3.2.2, L2TPv3 Data Message Header * RFC3931, Section 4.1.2.1, L2TPv3 Session Header over UDP */ static bool l2tp_udp_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 thoff) { const struct xt_l2tp_info *info = par->matchinfo; int uhlen = sizeof(struct udphdr); int offs = thoff + uhlen; union l2tp_val *lh; union l2tp_val lhbuf; u16 flags; struct l2tp_data data = { 0, }; if (par->fragoff != 0) return false; /* Extract L2TP header fields. The flags in the first 16 bits * tell us where the other fields are. */ lh = skb_header_pointer(skb, offs, 2, &lhbuf); if (lh == NULL) return false; flags = ntohs(lh->val16[0]); if (flags & L2TP_HDR_T_BIT) data.type = XT_L2TP_TYPE_CONTROL; else data.type = XT_L2TP_TYPE_DATA; data.version = (u8) flags & L2TP_HDR_VER; /* Now extract the L2TP tid/sid. These are in different places * for L2TPv2 (rfc2661) and L2TPv3 (rfc3931). For L2TPv2, we * must also check to see if the length field is present, * since this affects the offsets into the packet of the * tid/sid fields. */ if (data.version == 3) { lh = skb_header_pointer(skb, offs + 4, 4, &lhbuf); if (lh == NULL) return false; if (data.type == XT_L2TP_TYPE_CONTROL) data.tid = ntohl(lh->val32); else data.sid = ntohl(lh->val32); } else if (data.version == 2) { if (flags & L2TP_HDR_L_BIT) offs += 2; lh = skb_header_pointer(skb, offs + 2, 4, &lhbuf); if (lh == NULL) return false; data.tid = (u32) ntohs(lh->val16[0]); data.sid = (u32) ntohs(lh->val16[1]); } else return false; return l2tp_match(info, &data); } /* Parse L2TP header fields for IP encapsulation (no UDP header). * L2TPv3 data packets have a different form with IP encap. See * RC3931, Section 4.1.1.1, L2TPv3 Session Header over IP. * RC3931, Section 4.1.1.2, L2TPv3 Control and Data Traffic over IP. */ static bool l2tp_ip_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 thoff) { const struct xt_l2tp_info *info = par->matchinfo; union l2tp_val *lh; union l2tp_val lhbuf; struct l2tp_data data = { 0, }; /* For IP encap, the L2TP sid is the first 32-bits. */ lh = skb_header_pointer(skb, thoff, sizeof(lhbuf), &lhbuf); if (lh == NULL) return false; if (lh->val32 == 0) { /* Must be a control packet. The L2TP tid is further * into the packet. */ data.type = XT_L2TP_TYPE_CONTROL; lh = skb_header_pointer(skb, thoff + 8, sizeof(lhbuf), &lhbuf); if (lh == NULL) return false; data.tid = ntohl(lh->val32); } else { data.sid = ntohl(lh->val32); data.type = XT_L2TP_TYPE_DATA; } data.version = 3; return l2tp_match(info, &data); } static bool l2tp_mt4(const struct sk_buff *skb, struct xt_action_param *par) { struct iphdr *iph = ip_hdr(skb); u8 ipproto = iph->protocol; /* l2tp_mt_check4 already restricts the transport protocol */ switch (ipproto) { case IPPROTO_UDP: return l2tp_udp_mt(skb, par, par->thoff); case IPPROTO_L2TP: return l2tp_ip_mt(skb, par, par->thoff); } return false; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static bool l2tp_mt6(const struct sk_buff *skb, struct xt_action_param *par) { unsigned int thoff = 0; unsigned short fragoff = 0; int ipproto; ipproto = ipv6_find_hdr(skb, &thoff, -1, &fragoff, NULL); if (fragoff != 0) return false; /* l2tp_mt_check6 already restricts the transport protocol */ switch (ipproto) { case IPPROTO_UDP: return l2tp_udp_mt(skb, par, thoff); case IPPROTO_L2TP: return l2tp_ip_mt(skb, par, thoff); } return false; } #endif static int l2tp_mt_check(const struct xt_mtchk_param *par) { const struct xt_l2tp_info *info = par->matchinfo; /* Check for invalid flags */ if (info->flags & ~(XT_L2TP_TID | XT_L2TP_SID | XT_L2TP_VERSION | XT_L2TP_TYPE)) { pr_info_ratelimited("unknown flags: %x\n", info->flags); return -EINVAL; } /* At least one of tid, sid or type=control must be specified */ if ((!(info->flags & XT_L2TP_TID)) && (!(info->flags & XT_L2TP_SID)) && ((!(info->flags & XT_L2TP_TYPE)) || (info->type != XT_L2TP_TYPE_CONTROL))) { pr_info_ratelimited("invalid flags combination: %x\n", info->flags); return -EINVAL; } /* If version 2 is specified, check that incompatible params * are not supplied */ if (info->flags & XT_L2TP_VERSION) { if ((info->version < 2) || (info->version > 3)) { pr_info_ratelimited("wrong L2TP version: %u\n", info->version); return -EINVAL; } if (info->version == 2) { if ((info->flags & XT_L2TP_TID) && (info->tid > 0xffff)) { pr_info_ratelimited("v2 tid > 0xffff: %u\n", info->tid); return -EINVAL; } if ((info->flags & XT_L2TP_SID) && (info->sid > 0xffff)) { pr_info_ratelimited("v2 sid > 0xffff: %u\n", info->sid); return -EINVAL; } } } return 0; } static int l2tp_mt_check4(const struct xt_mtchk_param *par) { const struct xt_l2tp_info *info = par->matchinfo; const struct ipt_entry *e = par->entryinfo; const struct ipt_ip *ip = &e->ip; int ret; ret = l2tp_mt_check(par); if (ret != 0) return ret; if ((ip->proto != IPPROTO_UDP) && (ip->proto != IPPROTO_L2TP)) { pr_info_ratelimited("missing protocol rule (udp|l2tpip)\n"); return -EINVAL; } if ((ip->proto == IPPROTO_L2TP) && (info->version == 2)) { pr_info_ratelimited("v2 doesn't support IP mode\n"); return -EINVAL; } return 0; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static int l2tp_mt_check6(const struct xt_mtchk_param *par) { const struct xt_l2tp_info *info = par->matchinfo; const struct ip6t_entry *e = par->entryinfo; const struct ip6t_ip6 *ip = &e->ipv6; int ret; ret = l2tp_mt_check(par); if (ret != 0) return ret; if ((ip->proto != IPPROTO_UDP) && (ip->proto != IPPROTO_L2TP)) { pr_info_ratelimited("missing protocol rule (udp|l2tpip)\n"); return -EINVAL; } if ((ip->proto == IPPROTO_L2TP) && (info->version == 2)) { pr_info_ratelimited("v2 doesn't support IP mode\n"); return -EINVAL; } return 0; } #endif static struct xt_match l2tp_mt_reg[] __read_mostly = { { .name = "l2tp", .revision = 0, .family = NFPROTO_IPV4, .match = l2tp_mt4, .matchsize = XT_ALIGN(sizeof(struct xt_l2tp_info)), .checkentry = l2tp_mt_check4, .hooks = ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD)), .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "l2tp", .revision = 0, .family = NFPROTO_IPV6, .match = l2tp_mt6, .matchsize = XT_ALIGN(sizeof(struct xt_l2tp_info)), .checkentry = l2tp_mt_check6, .hooks = ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD)), .me = THIS_MODULE, }, #endif }; static int __init l2tp_mt_init(void) { return xt_register_matches(&l2tp_mt_reg[0], ARRAY_SIZE(l2tp_mt_reg)); } static void __exit l2tp_mt_exit(void) { xt_unregister_matches(&l2tp_mt_reg[0], ARRAY_SIZE(l2tp_mt_reg)); } module_init(l2tp_mt_init); module_exit(l2tp_mt_exit);
24 25 37 22 16 7 6 19 10 6 13 15 15 9 9 15 13 2 15 8 1 37 37 23 9 1 2 1 2 2 11 8 410 410 410 37 25 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 // SPDX-License-Identifier: GPL-2.0 /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The IP fragmentation functionality. * * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox <alan@lxorguk.ukuu.org.uk> * * Fixes: * Alan Cox : Split from ip.c , see ip_input.c for history. * David S. Miller : Begin massive cleanup... * Andi Kleen : Add sysctls. * xxxx : Overlapfrag bug. * Ultima : ip_expire() kernel panic. * Bill Hawes : Frag accounting and evictor fixes. * John McDonald : 0 length frag bug. * Alexey Kuznetsov: SMP races, threading, cleanup. * Patrick McHardy : LRU queue of frag heads for evictor. */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/compiler.h> #include <linux/module.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/jiffies.h> #include <linux/skbuff.h> #include <linux/list.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/netdevice.h> #include <linux/jhash.h> #include <linux/random.h> #include <linux/slab.h> #include <net/route.h> #include <net/dst.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/checksum.h> #include <net/inetpeer.h> #include <net/inet_frag.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/inet.h> #include <linux/netfilter_ipv4.h> #include <net/inet_ecn.h> #include <net/l3mdev.h> /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c * as well. Or notify me, at least. --ANK */ static const char ip_frag_cache_name[] = "ip4-frags"; /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { struct inet_frag_queue q; u8 ecn; /* RFC3168 support */ u16 max_df_size; /* largest frag with DF set seen */ int iif; unsigned int rid; struct inet_peer *peer; }; static u8 ip4_frag_ecn(u8 tos) { return 1 << (tos & INET_ECN_MASK); } static struct inet_frags ip4_frags; static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev); static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { struct ipq *qp = container_of(q, struct ipq, q); struct net *net = q->fqdir->net; const struct frag_v4_compare_key *key = a; q->key.v4 = *key; qp->ecn = 0; qp->peer = q->fqdir->max_dist ? inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) : NULL; } static void ip4_frag_free(struct inet_frag_queue *q) { struct ipq *qp; qp = container_of(q, struct ipq, q); if (qp->peer) inet_putpeer(qp->peer); } /* Destruction primitives. */ static void ipq_put(struct ipq *ipq) { inet_frag_put(&ipq->q); } /* Kill ipq entry. It is not destroyed immediately, * because caller (and someone more) holds reference count. */ static void ipq_kill(struct ipq *ipq) { inet_frag_kill(&ipq->q); } static bool frag_expire_skip_icmp(u32 user) { return user == IP_DEFRAG_AF_PACKET || ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN, __IP_DEFRAG_CONNTRACK_IN_END) || ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN, __IP_DEFRAG_CONNTRACK_BRIDGE_IN); } /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. */ static void ip_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); const struct iphdr *iph; struct sk_buff *head = NULL; struct net *net; struct ipq *qp; int err; qp = container_of(frag, struct ipq, q); net = qp->q.fqdir->net; rcu_read_lock(); /* Paired with WRITE_ONCE() in fqdir_pre_exit(). */ if (READ_ONCE(qp->q.fqdir->dead)) goto out_rcu_unlock; spin_lock(&qp->q.lock); if (qp->q.flags & INET_FRAG_COMPLETE) goto out; qp->q.flags |= INET_FRAG_DROP; ipq_kill(qp); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); if (!(qp->q.flags & INET_FRAG_FIRST_IN)) goto out; /* sk_buff::dev and sk_buff::rbnode are unionized. So we * pull the head out of the tree in order to be able to * deal with head->dev. */ head = inet_frag_pull_head(&qp->q); if (!head) goto out; head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) goto out; /* skb has no dst, perform route lookup again */ iph = ip_hdr(head); err = ip_route_input_noref(head, iph->daddr, iph->saddr, iph->tos, head->dev); if (err) goto out; /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ if (frag_expire_skip_icmp(qp->q.key.v4.user) && (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out; spin_unlock(&qp->q.lock); icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); goto out_rcu_unlock; out: spin_unlock(&qp->q.lock); out_rcu_unlock: rcu_read_unlock(); kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT); ipq_put(qp); } /* Find the correct entry in the "incomplete datagrams" queue for * this IP datagram, and create new one, if nothing is found. */ static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user, int vif) { struct frag_v4_compare_key key = { .saddr = iph->saddr, .daddr = iph->daddr, .user = user, .vif = vif, .id = iph->id, .protocol = iph->protocol, }; struct inet_frag_queue *q; q = inet_frag_find(net->ipv4.fqdir, &key); if (!q) return NULL; return container_of(q, struct ipq, q); } /* Is the fragment too far ahead to be part of ipq? */ static int ip_frag_too_far(struct ipq *qp) { struct inet_peer *peer = qp->peer; unsigned int max = qp->q.fqdir->max_dist; unsigned int start, end; int rc; if (!peer || !max) return 0; start = qp->rid; end = atomic_inc_return(&peer->rid); qp->rid = end; rc = qp->q.fragments_tail && (end - start) > max; if (rc) __IP_INC_STATS(qp->q.fqdir->net, IPSTATS_MIB_REASMFAILS); return rc; } static int ip_frag_reinit(struct ipq *qp) { unsigned int sum_truesize = 0; if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) { refcount_inc(&qp->q.refcnt); return -ETIMEDOUT; } sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments, SKB_DROP_REASON_FRAG_TOO_FAR); sub_frag_mem_limit(qp->q.fqdir, sum_truesize); qp->q.flags = 0; qp->q.len = 0; qp->q.meat = 0; qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; qp->q.last_run_head = NULL; qp->iif = 0; qp->ecn = 0; return 0; } /* Add new segment to existing queue. */ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { struct net *net = qp->q.fqdir->net; int ihl, end, flags, offset; struct sk_buff *prev_tail; struct net_device *dev; unsigned int fragsize; int err = -ENOENT; SKB_DR(reason); u8 ecn; /* If reassembly is already done, @skb must be a duplicate frag. */ if (qp->q.flags & INET_FRAG_COMPLETE) { SKB_DR_SET(reason, DUP_FRAG); goto err; } if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && unlikely(ip_frag_too_far(qp)) && unlikely(err = ip_frag_reinit(qp))) { ipq_kill(qp); goto err; } ecn = ip4_frag_ecn(ip_hdr(skb)->tos); offset = ntohs(ip_hdr(skb)->frag_off); flags = offset & ~IP_OFFSET; offset &= IP_OFFSET; offset <<= 3; /* offset is in 8-byte chunks */ ihl = ip_hdrlen(skb); /* Determine the position of this fragment. */ end = offset + skb->len - skb_network_offset(skb) - ihl; err = -EINVAL; /* Is this the final fragment? */ if ((flags & IP_MF) == 0) { /* If we already have some bits beyond end * or have different end, the segment is corrupted. */ if (end < qp->q.len || ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len)) goto discard_qp; qp->q.flags |= INET_FRAG_LAST_IN; qp->q.len = end; } else { if (end&7) { end &= ~7; if (skb->ip_summed != CHECKSUM_UNNECESSARY) skb->ip_summed = CHECKSUM_NONE; } if (end > qp->q.len) { /* Some bits beyond end -> corruption. */ if (qp->q.flags & INET_FRAG_LAST_IN) goto discard_qp; qp->q.len = end; } } if (end == offset) goto discard_qp; err = -ENOMEM; if (!pskb_pull(skb, skb_network_offset(skb) + ihl)) goto discard_qp; err = pskb_trim_rcsum(skb, end - offset); if (err) goto discard_qp; /* Note : skb->rbnode and skb->dev share the same location. */ dev = skb->dev; /* Makes sure compiler wont do silly aliasing games */ barrier(); prev_tail = qp->q.fragments_tail; err = inet_frag_queue_insert(&qp->q, skb, offset, end); if (err) goto insert_error; if (dev) qp->iif = dev->ifindex; qp->q.stamp = skb->tstamp; qp->q.mono_delivery_time = skb->mono_delivery_time; qp->q.meat += skb->len; qp->ecn |= ecn; add_frag_mem_limit(qp->q.fqdir, skb->truesize); if (offset == 0) qp->q.flags |= INET_FRAG_FIRST_IN; fragsize = skb->len + ihl; if (fragsize > qp->q.max_size) qp->q.max_size = fragsize; if (ip_hdr(skb)->frag_off & htons(IP_DF) && fragsize > qp->max_df_size) qp->max_df_size = fragsize; if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && qp->q.meat == qp->q.len) { unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; err = ip_frag_reasm(qp, skb, prev_tail, dev); skb->_skb_refdst = orefdst; if (err) inet_frag_kill(&qp->q); return err; } skb_dst_drop(skb); return -EINPROGRESS; insert_error: if (err == IPFRAG_DUP) { SKB_DR_SET(reason, DUP_FRAG); err = -EINVAL; goto err; } err = -EINVAL; __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); discard_qp: inet_frag_kill(&qp->q); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); err: kfree_skb_reason(skb, reason); return err; } static bool ip_frag_coalesce_ok(const struct ipq *qp) { return qp->q.key.v4.user == IP_DEFRAG_LOCAL_DELIVER; } /* Build a new IP datagram from all its fragments. */ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev) { struct net *net = qp->q.fqdir->net; struct iphdr *iph; void *reasm_data; int len, err; u8 ecn; ipq_kill(qp); ecn = ip_frag_ecn_table[qp->ecn]; if (unlikely(ecn == 0xff)) { err = -EINVAL; goto out_fail; } /* Make the one we just received the head. */ reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail); if (!reasm_data) goto out_nomem; len = ip_hdrlen(skb) + qp->q.len; err = -E2BIG; if (len > 65535) goto out_oversize; inet_frag_reasm_finish(&qp->q, skb, reasm_data, ip_frag_coalesce_ok(qp)); skb->dev = dev; IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size); iph = ip_hdr(skb); iph->tot_len = htons(len); iph->tos |= ecn; /* When we set IP_DF on a refragmented skb we must also force a * call to ip_fragment to avoid forwarding a DF-skb of size s while * original sender only sent fragments of size f (where f < s). * * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest * frag seen to avoid sending tiny DF-fragments in case skb was built * from one very small df-fragment and one large non-df frag. */ if (qp->max_df_size == qp->q.max_size) { IPCB(skb)->flags |= IPSKB_FRAG_PMTU; iph->frag_off = htons(IP_DF); } else { iph->frag_off = 0; } ip_send_check(iph); __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS); qp->q.rb_fragments = RB_ROOT; qp->q.fragments_tail = NULL; qp->q.last_run_head = NULL; return 0; out_nomem: net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp); err = -ENOMEM; goto out_fail; out_oversize: net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr); out_fail: __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); return err; } /* Process an incoming IP datagram fragment. */ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) { struct net_device *dev = skb->dev ? : skb_dst(skb)->dev; int vif = l3mdev_master_ifindex_rcu(dev); struct ipq *qp; __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS); skb_orphan(skb); /* Lookup (or create) queue header */ qp = ip_find(net, ip_hdr(skb), user, vif); if (qp) { int ret; spin_lock(&qp->q.lock); ret = ip_frag_queue(qp, skb); spin_unlock(&qp->q.lock); ipq_put(qp); return ret; } __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -ENOMEM; } EXPORT_SYMBOL(ip_defrag); struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) { struct iphdr iph; int netoff; u32 len; if (skb->protocol != htons(ETH_P_IP)) return skb; netoff = skb_network_offset(skb); if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0) return skb; if (iph.ihl < 5 || iph.version != 4) return skb; len = ntohs(iph.tot_len); if (skb->len < netoff + len || len < (iph.ihl * 4)) return skb; if (ip_is_fragment(&iph)) { skb = skb_share_check(skb, GFP_ATOMIC); if (skb) { if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) { kfree_skb(skb); return NULL; } if (pskb_trim_rcsum(skb, netoff + len)) { kfree_skb(skb); return NULL; } memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); if (ip_defrag(net, skb, user)) return NULL; skb_clear_hash(skb); } } return skb; } EXPORT_SYMBOL(ip_check_defrag); #ifdef CONFIG_SYSCTL static int dist_min; static struct ctl_table ip4_frags_ns_ctl_table[] = { { .procname = "ipfrag_high_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "ipfrag_low_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "ipfrag_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "ipfrag_max_dist", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &dist_min, }, { } }; /* secret interval has been deprecated */ static int ip4_frags_secret_interval_unused; static struct ctl_table ip4_frags_ctl_table[] = { { .procname = "ipfrag_secret_interval", .data = &ip4_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { } }; static int __net_init ip4_frags_ns_ctl_register(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; table = ip4_frags_ns_ctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL); if (!table) goto err_alloc; } table[0].data = &net->ipv4.fqdir->high_thresh; table[0].extra1 = &net->ipv4.fqdir->low_thresh; table[1].data = &net->ipv4.fqdir->low_thresh; table[1].extra2 = &net->ipv4.fqdir->high_thresh; table[2].data = &net->ipv4.fqdir->timeout; table[3].data = &net->ipv4.fqdir->max_dist; hdr = register_net_sysctl_sz(net, "net/ipv4", table, ARRAY_SIZE(ip4_frags_ns_ctl_table)); if (!hdr) goto err_reg; net->ipv4.frags_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) { struct ctl_table *table; table = net->ipv4.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.frags_hdr); kfree(table); } static void __init ip4_frags_ctl_register(void) { register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table); } #else static int ip4_frags_ns_ctl_register(struct net *net) { return 0; } static void ip4_frags_ns_ctl_unregister(struct net *net) { } static void __init ip4_frags_ctl_register(void) { } #endif static int __net_init ipv4_frags_init_net(struct net *net) { int res; res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net); if (res < 0) return res; /* Fragment cache limits. * * The fragment memory accounting code, (tries to) account for * the real memory usage, by measuring both the size of frag * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue)) * and the SKB's truesize. * * A 64K fragment consumes 129736 bytes (44*2944)+200 * (1500 truesize == 2944, sizeof(struct ipq) == 200) * * We will commit 4MB at one time. Should we cross that limit * we will prune down to 3MB, making room for approx 8 big 64K * fragments 8x128k. */ net->ipv4.fqdir->high_thresh = 4 * 1024 * 1024; net->ipv4.fqdir->low_thresh = 3 * 1024 * 1024; /* * Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival * by TTL. */ net->ipv4.fqdir->timeout = IP_FRAG_TIME; net->ipv4.fqdir->max_dist = 64; res = ip4_frags_ns_ctl_register(net); if (res < 0) fqdir_exit(net->ipv4.fqdir); return res; } static void __net_exit ipv4_frags_pre_exit_net(struct net *net) { fqdir_pre_exit(net->ipv4.fqdir); } static void __net_exit ipv4_frags_exit_net(struct net *net) { ip4_frags_ns_ctl_unregister(net); fqdir_exit(net->ipv4.fqdir); } static struct pernet_operations ip4_frags_ops = { .init = ipv4_frags_init_net, .pre_exit = ipv4_frags_pre_exit_net, .exit = ipv4_frags_exit_net, }; static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed) { return jhash2(data, sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); } static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed) { const struct inet_frag_queue *fq = data; return jhash2((const u32 *)&fq->key.v4, sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); } static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) { const struct frag_v4_compare_key *key = arg->key; const struct inet_frag_queue *fq = ptr; return !!memcmp(&fq->key, key, sizeof(*key)); } static const struct rhashtable_params ip4_rhash_params = { .head_offset = offsetof(struct inet_frag_queue, node), .key_offset = offsetof(struct inet_frag_queue, key), .key_len = sizeof(struct frag_v4_compare_key), .hashfn = ip4_key_hashfn, .obj_hashfn = ip4_obj_hashfn, .obj_cmpfn = ip4_obj_cmpfn, .automatic_shrinking = true, }; void __init ipfrag_init(void) { ip4_frags.constructor = ip4_frag_init; ip4_frags.destructor = ip4_frag_free; ip4_frags.qsize = sizeof(struct ipq); ip4_frags.frag_expire = ip_expire; ip4_frags.frags_cache_name = ip_frag_cache_name; ip4_frags.rhash_params = ip4_rhash_params; if (inet_frags_init(&ip4_frags)) panic("IP: failed to allocate ip4_frags cache\n"); ip4_frags_ctl_register(); register_pernet_subsys(&ip4_frags_ops); }
1 39 107 42 83 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UDF_DECL_H #define __UDF_DECL_H #define pr_fmt(fmt) "UDF-fs: " fmt #include "ecma_167.h" #include "osta_udf.h" #include <linux/fs.h> #include <linux/types.h> #include <linux/buffer_head.h> #include <linux/udf_fs_i.h> #include "udf_sb.h" #include "udfend.h" #include "udf_i.h" #define UDF_DEFAULT_PREALLOC_BLOCKS 8 extern __printf(3, 4) void _udf_err(struct super_block *sb, const char *function, const char *fmt, ...); #define udf_err(sb, fmt, ...) \ _udf_err(sb, __func__, fmt, ##__VA_ARGS__) extern __printf(3, 4) void _udf_warn(struct super_block *sb, const char *function, const char *fmt, ...); #define udf_warn(sb, fmt, ...) \ _udf_warn(sb, __func__, fmt, ##__VA_ARGS__) #define udf_info(fmt, ...) \ pr_info("INFO " fmt, ##__VA_ARGS__) #define udf_debug(fmt, ...) \ pr_debug("%s:%d:%s: " fmt, __FILE__, __LINE__, __func__, ##__VA_ARGS__) #define UDF_EXTENT_LENGTH_MASK 0x3FFFFFFF #define UDF_EXTENT_FLAG_MASK 0xC0000000 #define UDF_INVALID_ID ((uint32_t)-1) #define UDF_NAME_PAD 4 #define UDF_NAME_LEN 254 #define UDF_NAME_LEN_CS0 255 static inline size_t udf_file_entry_alloc_offset(struct inode *inode) { struct udf_inode_info *iinfo = UDF_I(inode); if (iinfo->i_use) return sizeof(struct unallocSpaceEntry); else if (iinfo->i_efe) return sizeof(struct extendedFileEntry) + iinfo->i_lenEAttr; else return sizeof(struct fileEntry) + iinfo->i_lenEAttr; } static inline size_t udf_ext0_offset(struct inode *inode) { if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) return udf_file_entry_alloc_offset(inode); else return 0; } /* computes tag checksum */ u8 udf_tag_checksum(const struct tag *t); typedef uint32_t udf_pblk_t; struct dentry; struct inode; struct task_struct; struct buffer_head; struct super_block; extern const struct export_operations udf_export_ops; extern const struct inode_operations udf_dir_inode_operations; extern const struct file_operations udf_dir_operations; extern const struct inode_operations udf_file_inode_operations; extern const struct file_operations udf_file_operations; extern const struct inode_operations udf_symlink_inode_operations; extern const struct address_space_operations udf_aops; extern const struct address_space_operations udf_symlink_aops; struct udf_fileident_iter { struct inode *dir; /* Directory we are working with */ loff_t pos; /* Logical position in a dir */ struct buffer_head *bh[2]; /* Buffer containing 'pos' and possibly * next buffer if entry straddles * blocks */ struct kernel_lb_addr eloc; /* Start of extent containing 'pos' */ uint32_t elen; /* Length of extent containing 'pos' */ sector_t loffset; /* Block offset of 'pos' within above * extent */ struct extent_position epos; /* Position after the above extent */ struct fileIdentDesc fi; /* Copied directory entry */ uint8_t *name; /* Pointer to entry name */ uint8_t *namebuf; /* Storage for entry name in case * the name is split between two blocks */ }; struct udf_vds_record { uint32_t block; uint32_t volDescSeqNum; }; struct generic_desc { struct tag descTag; __le32 volDescSeqNum; }; /* super.c */ static inline void udf_updated_lvid(struct super_block *sb) { struct buffer_head *bh = UDF_SB(sb)->s_lvid_bh; BUG_ON(!bh); WARN_ON_ONCE(((struct logicalVolIntegrityDesc *) bh->b_data)->integrityType != cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN)); UDF_SB(sb)->s_lvid_dirty = 1; } extern u64 lvid_get_unique_id(struct super_block *sb); struct inode *udf_find_metadata_inode_efe(struct super_block *sb, u32 meta_file_loc, u32 partition_num); /* namei.c */ static inline unsigned int udf_dir_entry_len(struct fileIdentDesc *cfi) { return ALIGN(sizeof(struct fileIdentDesc) + le16_to_cpu(cfi->lengthOfImpUse) + cfi->lengthFileIdent, UDF_NAME_PAD); } /* file.c */ extern long udf_ioctl(struct file *, unsigned int, unsigned long); /* inode.c */ extern struct inode *__udf_iget(struct super_block *, struct kernel_lb_addr *, bool hidden_inode); static inline struct inode *udf_iget_special(struct super_block *sb, struct kernel_lb_addr *ino) { return __udf_iget(sb, ino, true); } static inline struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino) { return __udf_iget(sb, ino, false); } extern int udf_expand_file_adinicb(struct inode *); extern struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block, int create, int *err); extern int udf_setsize(struct inode *, loff_t); extern void udf_evict_inode(struct inode *); extern int udf_write_inode(struct inode *, struct writeback_control *wbc); extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, struct kernel_lb_addr *, uint32_t *, sector_t *); int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); extern int udf_setup_indirect_aext(struct inode *inode, udf_pblk_t block, struct extent_position *epos); extern int __udf_add_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t elen, int inc); extern int udf_add_aext(struct inode *, struct extent_position *, struct kernel_lb_addr *, uint32_t, int); extern void udf_write_aext(struct inode *, struct extent_position *, struct kernel_lb_addr *, uint32_t, int); extern int8_t udf_delete_aext(struct inode *, struct extent_position); extern int8_t udf_next_aext(struct inode *, struct extent_position *, struct kernel_lb_addr *, uint32_t *, int); extern int8_t udf_current_aext(struct inode *, struct extent_position *, struct kernel_lb_addr *, uint32_t *, int); extern void udf_update_extra_perms(struct inode *inode, umode_t mode); /* misc.c */ extern struct genericFormat *udf_add_extendedattr(struct inode *, uint32_t, uint32_t, uint8_t); extern struct genericFormat *udf_get_extendedattr(struct inode *, uint32_t, uint8_t); extern struct buffer_head *udf_read_tagged(struct super_block *, uint32_t, uint32_t, uint16_t *); extern struct buffer_head *udf_read_ptagged(struct super_block *, struct kernel_lb_addr *, uint32_t, uint16_t *); extern void udf_update_tag(char *, int); extern void udf_new_tag(char *, uint16_t, uint16_t, uint16_t, uint32_t, int); /* lowlevel.c */ extern unsigned int udf_get_last_session(struct super_block *); udf_pblk_t udf_get_last_block(struct super_block *); /* partition.c */ extern uint32_t udf_get_pblock(struct super_block *, uint32_t, uint16_t, uint32_t); extern uint32_t udf_get_pblock_virt15(struct super_block *, uint32_t, uint16_t, uint32_t); extern uint32_t udf_get_pblock_virt20(struct super_block *, uint32_t, uint16_t, uint32_t); extern uint32_t udf_get_pblock_spar15(struct super_block *, uint32_t, uint16_t, uint32_t); extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t, uint32_t); extern int udf_relocate_blocks(struct super_block *, long, long *); static inline uint32_t udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc, uint32_t offset) { return udf_get_pblock(sb, loc->logicalBlockNum, loc->partitionReferenceNum, offset); } /* unicode.c */ extern int udf_get_filename(struct super_block *, const uint8_t *, int, uint8_t *, int); extern int udf_put_filename(struct super_block *, const uint8_t *, int, uint8_t *, int); extern int udf_dstrCS0toChar(struct super_block *, uint8_t *, int, const uint8_t *, int); /* ialloc.c */ extern void udf_free_inode(struct inode *); extern struct inode *udf_new_inode(struct inode *, umode_t); /* truncate.c */ extern void udf_truncate_tail_extent(struct inode *); extern void udf_discard_prealloc(struct inode *); extern int udf_truncate_extents(struct inode *); /* balloc.c */ extern void udf_free_blocks(struct super_block *, struct inode *, struct kernel_lb_addr *, uint32_t, uint32_t); extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t, uint32_t, uint32_t); extern udf_pblk_t udf_new_block(struct super_block *sb, struct inode *inode, uint16_t partition, uint32_t goal, int *err); /* directory.c */ int udf_fiiter_init(struct udf_fileident_iter *iter, struct inode *dir, loff_t pos); int udf_fiiter_advance(struct udf_fileident_iter *iter); void udf_fiiter_release(struct udf_fileident_iter *iter); void udf_fiiter_write_fi(struct udf_fileident_iter *iter, uint8_t *impuse); void udf_fiiter_update_elen(struct udf_fileident_iter *iter, uint32_t new_elen); int udf_fiiter_append_blk(struct udf_fileident_iter *iter); extern struct long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int); extern struct short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int); /* udftime.c */ extern void udf_disk_stamp_to_time(struct timespec64 *dest, struct timestamp src); extern void udf_time_to_disk_stamp(struct timestamp *dest, struct timespec64 src); #endif /* __UDF_DECL_H */
1 46 44 47 13 1 12 46 47 12 12 47 47 15 3 41 44 12 12 12 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 // SPDX-License-Identifier: GPL-2.0+ /* * NILFS recovery logic * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * Written by Ryusuke Konishi. */ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/swap.h> #include <linux/slab.h> #include <linux/crc32.h> #include "nilfs.h" #include "segment.h" #include "sufile.h" #include "page.h" #include "segbuf.h" /* * Segment check result */ enum { NILFS_SEG_VALID, NILFS_SEG_NO_SUPER_ROOT, NILFS_SEG_FAIL_IO, NILFS_SEG_FAIL_MAGIC, NILFS_SEG_FAIL_SEQ, NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT, NILFS_SEG_FAIL_CHECKSUM_FULL, NILFS_SEG_FAIL_CONSISTENCY, }; /* work structure for recovery */ struct nilfs_recovery_block { ino_t ino; /* * Inode number of the file that this block * belongs to */ sector_t blocknr; /* block number */ __u64 vblocknr; /* virtual block number */ unsigned long blkoff; /* File offset of the data block (per block) */ struct list_head list; }; static int nilfs_warn_segment_error(struct super_block *sb, int err) { const char *msg = NULL; switch (err) { case NILFS_SEG_FAIL_IO: nilfs_err(sb, "I/O error reading segment"); return -EIO; case NILFS_SEG_FAIL_MAGIC: msg = "Magic number mismatch"; break; case NILFS_SEG_FAIL_SEQ: msg = "Sequence number mismatch"; break; case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT: msg = "Checksum error in super root"; break; case NILFS_SEG_FAIL_CHECKSUM_FULL: msg = "Checksum error in segment payload"; break; case NILFS_SEG_FAIL_CONSISTENCY: msg = "Inconsistency found"; break; case NILFS_SEG_NO_SUPER_ROOT: msg = "No super root in the last segment"; break; default: nilfs_err(sb, "unrecognized segment error %d", err); return -EINVAL; } nilfs_warn(sb, "invalid segment: %s", msg); return -EINVAL; } /** * nilfs_compute_checksum - compute checksum of blocks continuously * @nilfs: nilfs object * @bhs: buffer head of start block * @sum: place to store result * @offset: offset bytes in the first block * @check_bytes: number of bytes to be checked * @start: DBN of start block * @nblock: number of blocks to be checked */ static int nilfs_compute_checksum(struct the_nilfs *nilfs, struct buffer_head *bhs, u32 *sum, unsigned long offset, u64 check_bytes, sector_t start, unsigned long nblock) { unsigned int blocksize = nilfs->ns_blocksize; unsigned long size; u32 crc; BUG_ON(offset >= blocksize); check_bytes -= offset; size = min_t(u64, check_bytes, blocksize - offset); crc = crc32_le(nilfs->ns_crc_seed, (unsigned char *)bhs->b_data + offset, size); if (--nblock > 0) { do { struct buffer_head *bh; bh = __bread(nilfs->ns_bdev, ++start, blocksize); if (!bh) return -EIO; check_bytes -= size; size = min_t(u64, check_bytes, blocksize); crc = crc32_le(crc, bh->b_data, size); brelse(bh); } while (--nblock > 0); } *sum = crc; return 0; } /** * nilfs_read_super_root_block - read super root block * @nilfs: nilfs object * @sr_block: disk block number of the super root block * @pbh: address of a buffer_head pointer to return super root buffer * @check: CRC check flag */ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block, struct buffer_head **pbh, int check) { struct buffer_head *bh_sr; struct nilfs_super_root *sr; u32 crc; int ret; *pbh = NULL; bh_sr = __bread(nilfs->ns_bdev, sr_block, nilfs->ns_blocksize); if (unlikely(!bh_sr)) { ret = NILFS_SEG_FAIL_IO; goto failed; } sr = (struct nilfs_super_root *)bh_sr->b_data; if (check) { unsigned int bytes = le16_to_cpu(sr->sr_bytes); if (bytes == 0 || bytes > nilfs->ns_blocksize) { ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; goto failed_bh; } if (nilfs_compute_checksum( nilfs, bh_sr, &crc, sizeof(sr->sr_sum), bytes, sr_block, 1)) { ret = NILFS_SEG_FAIL_IO; goto failed_bh; } if (crc != le32_to_cpu(sr->sr_sum)) { ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; goto failed_bh; } } *pbh = bh_sr; return 0; failed_bh: brelse(bh_sr); failed: return nilfs_warn_segment_error(nilfs->ns_sb, ret); } /** * nilfs_read_log_header - read summary header of the specified log * @nilfs: nilfs object * @start_blocknr: start block number of the log * @sum: pointer to return segment summary structure */ static struct buffer_head * nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr, struct nilfs_segment_summary **sum) { struct buffer_head *bh_sum; bh_sum = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize); if (bh_sum) *sum = (struct nilfs_segment_summary *)bh_sum->b_data; return bh_sum; } /** * nilfs_validate_log - verify consistency of log * @nilfs: nilfs object * @seg_seq: sequence number of segment * @bh_sum: buffer head of summary block * @sum: segment summary struct */ static int nilfs_validate_log(struct the_nilfs *nilfs, u64 seg_seq, struct buffer_head *bh_sum, struct nilfs_segment_summary *sum) { unsigned long nblock; u32 crc; int ret; ret = NILFS_SEG_FAIL_MAGIC; if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) goto out; ret = NILFS_SEG_FAIL_SEQ; if (le64_to_cpu(sum->ss_seq) != seg_seq) goto out; nblock = le32_to_cpu(sum->ss_nblocks); ret = NILFS_SEG_FAIL_CONSISTENCY; if (unlikely(nblock == 0 || nblock > nilfs->ns_blocks_per_segment)) /* This limits the number of blocks read in the CRC check */ goto out; ret = NILFS_SEG_FAIL_IO; if (nilfs_compute_checksum(nilfs, bh_sum, &crc, sizeof(sum->ss_datasum), ((u64)nblock << nilfs->ns_blocksize_bits), bh_sum->b_blocknr, nblock)) goto out; ret = NILFS_SEG_FAIL_CHECKSUM_FULL; if (crc != le32_to_cpu(sum->ss_datasum)) goto out; ret = 0; out: return ret; } /** * nilfs_read_summary_info - read an item on summary blocks of a log * @nilfs: nilfs object * @pbh: the current buffer head on summary blocks [in, out] * @offset: the current byte offset on summary blocks [in, out] * @bytes: byte size of the item to be read */ static void *nilfs_read_summary_info(struct the_nilfs *nilfs, struct buffer_head **pbh, unsigned int *offset, unsigned int bytes) { void *ptr; sector_t blocknr; BUG_ON((*pbh)->b_size < *offset); if (bytes > (*pbh)->b_size - *offset) { blocknr = (*pbh)->b_blocknr; brelse(*pbh); *pbh = __bread(nilfs->ns_bdev, blocknr + 1, nilfs->ns_blocksize); if (unlikely(!*pbh)) return NULL; *offset = 0; } ptr = (*pbh)->b_data + *offset; *offset += bytes; return ptr; } /** * nilfs_skip_summary_info - skip items on summary blocks of a log * @nilfs: nilfs object * @pbh: the current buffer head on summary blocks [in, out] * @offset: the current byte offset on summary blocks [in, out] * @bytes: byte size of the item to be skipped * @count: number of items to be skipped */ static void nilfs_skip_summary_info(struct the_nilfs *nilfs, struct buffer_head **pbh, unsigned int *offset, unsigned int bytes, unsigned long count) { unsigned int rest_item_in_current_block = ((*pbh)->b_size - *offset) / bytes; if (count <= rest_item_in_current_block) { *offset += bytes * count; } else { sector_t blocknr = (*pbh)->b_blocknr; unsigned int nitem_per_block = (*pbh)->b_size / bytes; unsigned int bcnt; count -= rest_item_in_current_block; bcnt = DIV_ROUND_UP(count, nitem_per_block); *offset = bytes * (count - (bcnt - 1) * nitem_per_block); brelse(*pbh); *pbh = __bread(nilfs->ns_bdev, blocknr + bcnt, nilfs->ns_blocksize); } } /** * nilfs_scan_dsync_log - get block information of a log written for data sync * @nilfs: nilfs object * @start_blocknr: start block number of the log * @sum: log summary information * @head: list head to add nilfs_recovery_block struct */ static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr, struct nilfs_segment_summary *sum, struct list_head *head) { struct buffer_head *bh; unsigned int offset; u32 nfinfo, sumbytes; sector_t blocknr; ino_t ino; int err = -EIO; nfinfo = le32_to_cpu(sum->ss_nfinfo); if (!nfinfo) return 0; sumbytes = le32_to_cpu(sum->ss_sumbytes); blocknr = start_blocknr + DIV_ROUND_UP(sumbytes, nilfs->ns_blocksize); bh = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize); if (unlikely(!bh)) goto out; offset = le16_to_cpu(sum->ss_bytes); for (;;) { unsigned long nblocks, ndatablk, nnodeblk; struct nilfs_finfo *finfo; finfo = nilfs_read_summary_info(nilfs, &bh, &offset, sizeof(*finfo)); if (unlikely(!finfo)) goto out; ino = le64_to_cpu(finfo->fi_ino); nblocks = le32_to_cpu(finfo->fi_nblocks); ndatablk = le32_to_cpu(finfo->fi_ndatablk); nnodeblk = nblocks - ndatablk; while (ndatablk-- > 0) { struct nilfs_recovery_block *rb; struct nilfs_binfo_v *binfo; binfo = nilfs_read_summary_info(nilfs, &bh, &offset, sizeof(*binfo)); if (unlikely(!binfo)) goto out; rb = kmalloc(sizeof(*rb), GFP_NOFS); if (unlikely(!rb)) { err = -ENOMEM; goto out; } rb->ino = ino; rb->blocknr = blocknr++; rb->vblocknr = le64_to_cpu(binfo->bi_vblocknr); rb->blkoff = le64_to_cpu(binfo->bi_blkoff); /* INIT_LIST_HEAD(&rb->list); */ list_add_tail(&rb->list, head); } if (--nfinfo == 0) break; blocknr += nnodeblk; /* always 0 for data sync logs */ nilfs_skip_summary_info(nilfs, &bh, &offset, sizeof(__le64), nnodeblk); if (unlikely(!bh)) goto out; } err = 0; out: brelse(bh); /* brelse(NULL) is just ignored */ return err; } static void dispose_recovery_list(struct list_head *head) { while (!list_empty(head)) { struct nilfs_recovery_block *rb; rb = list_first_entry(head, struct nilfs_recovery_block, list); list_del(&rb->list); kfree(rb); } } struct nilfs_segment_entry { struct list_head list; __u64 segnum; }; static int nilfs_segment_list_add(struct list_head *head, __u64 segnum) { struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS); if (unlikely(!ent)) return -ENOMEM; ent->segnum = segnum; INIT_LIST_HEAD(&ent->list); list_add_tail(&ent->list, head); return 0; } void nilfs_dispose_segment_list(struct list_head *head) { while (!list_empty(head)) { struct nilfs_segment_entry *ent; ent = list_first_entry(head, struct nilfs_segment_entry, list); list_del(&ent->list); kfree(ent); } } static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_recovery_info *ri) { struct list_head *head = &ri->ri_used_segments; struct nilfs_segment_entry *ent, *n; struct inode *sufile = nilfs->ns_sufile; __u64 segnum[4]; int err; int i; segnum[0] = nilfs->ns_segnum; segnum[1] = nilfs->ns_nextnum; segnum[2] = ri->ri_segnum; segnum[3] = ri->ri_nextnum; /* * Releasing the next segment of the latest super root. * The next segment is invalidated by this recovery. */ err = nilfs_sufile_free(sufile, segnum[1]); if (unlikely(err)) goto failed; for (i = 1; i < 4; i++) { err = nilfs_segment_list_add(head, segnum[i]); if (unlikely(err)) goto failed; } /* * Collecting segments written after the latest super root. * These are marked dirty to avoid being reallocated in the next write. */ list_for_each_entry_safe(ent, n, head, list) { if (ent->segnum != segnum[0]) { err = nilfs_sufile_scrap(sufile, ent->segnum); if (unlikely(err)) goto failed; } list_del(&ent->list); kfree(ent); } /* Allocate new segments for recovery */ err = nilfs_sufile_alloc(sufile, &segnum[0]); if (unlikely(err)) goto failed; nilfs->ns_pseg_offset = 0; nilfs->ns_seg_seq = ri->ri_seq + 2; nilfs->ns_nextnum = nilfs->ns_segnum = segnum[0]; failed: /* No need to recover sufile because it will be destroyed on error */ return err; } static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, struct nilfs_recovery_block *rb, struct page *page) { struct buffer_head *bh_org; void *kaddr; bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize); if (unlikely(!bh_org)) return -EIO; kaddr = kmap_atomic(page); memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size); kunmap_atomic(kaddr); brelse(bh_org); return 0; } static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_root *root, struct list_head *head, unsigned long *nr_salvaged_blocks) { struct inode *inode; struct nilfs_recovery_block *rb, *n; unsigned int blocksize = nilfs->ns_blocksize; struct page *page; loff_t pos; int err = 0, err2 = 0; list_for_each_entry_safe(rb, n, head, list) { inode = nilfs_iget(sb, root, rb->ino); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; goto failed_inode; } pos = rb->blkoff << inode->i_blkbits; err = block_write_begin(inode->i_mapping, pos, blocksize, &page, nilfs_get_block); if (unlikely(err)) { loff_t isize = inode->i_size; if (pos + blocksize > isize) nilfs_write_failed(inode->i_mapping, pos + blocksize); goto failed_inode; } err = nilfs_recovery_copy_block(nilfs, rb, page); if (unlikely(err)) goto failed_page; err = nilfs_set_file_dirty(inode, 1); if (unlikely(err)) goto failed_page; block_write_end(NULL, inode->i_mapping, pos, blocksize, blocksize, page, NULL); unlock_page(page); put_page(page); (*nr_salvaged_blocks)++; goto next; failed_page: unlock_page(page); put_page(page); failed_inode: nilfs_warn(sb, "error %d recovering data block (ino=%lu, block-offset=%llu)", err, (unsigned long)rb->ino, (unsigned long long)rb->blkoff); if (!err2) err2 = err; next: iput(inode); /* iput(NULL) is just ignored */ list_del_init(&rb->list); kfree(rb); } return err2; } /** * nilfs_do_roll_forward - salvage logical segments newer than the latest * checkpoint * @nilfs: nilfs object * @sb: super block instance * @ri: pointer to a nilfs_recovery_info */ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_root *root, struct nilfs_recovery_info *ri) { struct buffer_head *bh_sum = NULL; struct nilfs_segment_summary *sum = NULL; sector_t pseg_start; sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */ unsigned long nsalvaged_blocks = 0; unsigned int flags; u64 seg_seq; __u64 segnum, nextnum = 0; int empty_seg = 0; int err = 0, ret; LIST_HEAD(dsync_blocks); /* list of data blocks to be recovered */ enum { RF_INIT_ST, RF_DSYNC_ST, /* scanning data-sync segments */ }; int state = RF_INIT_ST; pseg_start = ri->ri_lsegs_start; seg_seq = ri->ri_lsegs_start_seq; segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { brelse(bh_sum); bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum); if (!bh_sum) { err = -EIO; goto failed; } ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum); if (ret) { if (ret == NILFS_SEG_FAIL_IO) { err = -EIO; goto failed; } goto strayed; } flags = le16_to_cpu(sum->ss_flags); if (flags & NILFS_SS_SR) goto confused; /* Found a valid partial segment; do recovery actions */ nextnum = nilfs_get_segnum_of_block(nilfs, le64_to_cpu(sum->ss_next)); empty_seg = 0; nilfs->ns_ctime = le64_to_cpu(sum->ss_create); if (!(flags & NILFS_SS_GC)) nilfs->ns_nongc_ctime = nilfs->ns_ctime; switch (state) { case RF_INIT_ST: if (!(flags & NILFS_SS_LOGBGN) || !(flags & NILFS_SS_SYNDT)) goto try_next_pseg; state = RF_DSYNC_ST; fallthrough; case RF_DSYNC_ST: if (!(flags & NILFS_SS_SYNDT)) goto confused; err = nilfs_scan_dsync_log(nilfs, pseg_start, sum, &dsync_blocks); if (unlikely(err)) goto failed; if (flags & NILFS_SS_LOGEND) { err = nilfs_recover_dsync_blocks( nilfs, sb, root, &dsync_blocks, &nsalvaged_blocks); if (unlikely(err)) goto failed; state = RF_INIT_ST; } break; /* Fall through to try_next_pseg */ } try_next_pseg: if (pseg_start == ri->ri_lsegs_end) break; pseg_start += le32_to_cpu(sum->ss_nblocks); if (pseg_start < seg_end) continue; goto feed_segment; strayed: if (pseg_start == ri->ri_lsegs_end) break; feed_segment: /* Looking to the next full segment */ if (empty_seg++) break; seg_seq++; segnum = nextnum; nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); pseg_start = seg_start; } if (nsalvaged_blocks) { nilfs_info(sb, "salvaged %lu blocks", nsalvaged_blocks); ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; } out: brelse(bh_sum); dispose_recovery_list(&dsync_blocks); return err; confused: err = -EINVAL; failed: nilfs_err(sb, "error %d roll-forwarding partial segment at blocknr = %llu", err, (unsigned long long)pseg_start); goto out; } static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, struct nilfs_recovery_info *ri) { struct buffer_head *bh; int err; if (nilfs_get_segnum_of_block(nilfs, ri->ri_lsegs_start) != nilfs_get_segnum_of_block(nilfs, ri->ri_super_root)) return; bh = __getblk(nilfs->ns_bdev, ri->ri_lsegs_start, nilfs->ns_blocksize); BUG_ON(!bh); memset(bh->b_data, 0, bh->b_size); set_buffer_dirty(bh); err = sync_dirty_buffer(bh); if (unlikely(err)) nilfs_warn(nilfs->ns_sb, "buffer sync write failed during post-cleaning of recovery."); brelse(bh); } /** * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint * @nilfs: nilfs object * @sb: super block instance * @ri: pointer to a nilfs_recovery_info struct to store search results. * * Return Value: On success, 0 is returned. On error, one of the following * negative error code is returned. * * %-EINVAL - Inconsistent filesystem state. * * %-EIO - I/O error * * %-ENOSPC - No space left on device (only in a panic state). * * %-ERESTARTSYS - Interrupted. * * %-ENOMEM - Insufficient memory available. */ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_recovery_info *ri) { struct nilfs_root *root; int err; if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0) return 0; err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root); if (unlikely(err)) { nilfs_err(sb, "error %d loading the latest checkpoint", err); return err; } err = nilfs_do_roll_forward(nilfs, sb, root, ri); if (unlikely(err)) goto failed; if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) { err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri); if (unlikely(err)) { nilfs_err(sb, "error %d preparing segment for recovery", err); goto failed; } err = nilfs_attach_log_writer(sb, root); if (unlikely(err)) goto failed; set_nilfs_discontinued(nilfs); err = nilfs_construct_segment(sb); nilfs_detach_log_writer(sb); if (unlikely(err)) { nilfs_err(sb, "error %d writing segment for recovery", err); goto failed; } nilfs_finish_roll_forward(nilfs, ri); } failed: nilfs_put_root(root); return err; } /** * nilfs_search_super_root - search the latest valid super root * @nilfs: the_nilfs * @ri: pointer to a nilfs_recovery_info struct to store search results. * * nilfs_search_super_root() looks for the latest super-root from a partial * segment pointed by the superblock. It sets up struct the_nilfs through * this search. It fills nilfs_recovery_info (ri) required for recovery. * * Return Value: On success, 0 is returned. On error, one of the following * negative error code is returned. * * %-EINVAL - No valid segment found * * %-EIO - I/O error * * %-ENOMEM - Insufficient memory available. */ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_recovery_info *ri) { struct buffer_head *bh_sum = NULL; struct nilfs_segment_summary *sum = NULL; sector_t pseg_start, pseg_end, sr_pseg_start = 0; sector_t seg_start, seg_end; /* range of full segment (block number) */ sector_t b, end; unsigned long nblocks; unsigned int flags; u64 seg_seq; __u64 segnum, nextnum = 0; __u64 cno; LIST_HEAD(segments); int empty_seg = 0, scan_newer = 0; int ret; pseg_start = nilfs->ns_last_pseg; seg_seq = nilfs->ns_last_seq; cno = nilfs->ns_last_cno; segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); /* Calculate range of segment */ nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); /* Read ahead segment */ b = seg_start; while (b <= seg_end) __breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize); for (;;) { brelse(bh_sum); ret = NILFS_SEG_FAIL_IO; bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum); if (!bh_sum) goto failed; ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum); if (ret) { if (ret == NILFS_SEG_FAIL_IO) goto failed; goto strayed; } nblocks = le32_to_cpu(sum->ss_nblocks); pseg_end = pseg_start + nblocks - 1; if (unlikely(pseg_end > seg_end)) { ret = NILFS_SEG_FAIL_CONSISTENCY; goto strayed; } /* A valid partial segment */ ri->ri_pseg_start = pseg_start; ri->ri_seq = seg_seq; ri->ri_segnum = segnum; nextnum = nilfs_get_segnum_of_block(nilfs, le64_to_cpu(sum->ss_next)); ri->ri_nextnum = nextnum; empty_seg = 0; flags = le16_to_cpu(sum->ss_flags); if (!(flags & NILFS_SS_SR) && !scan_newer) { /* * This will never happen because a superblock * (last_segment) always points to a pseg with * a super root. */ ret = NILFS_SEG_FAIL_CONSISTENCY; goto failed; } if (pseg_start == seg_start) { nilfs_get_segment_range(nilfs, nextnum, &b, &end); while (b <= end) __breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize); } if (!(flags & NILFS_SS_SR)) { if (!ri->ri_lsegs_start && (flags & NILFS_SS_LOGBGN)) { ri->ri_lsegs_start = pseg_start; ri->ri_lsegs_start_seq = seg_seq; } if (flags & NILFS_SS_LOGEND) ri->ri_lsegs_end = pseg_start; goto try_next_pseg; } /* A valid super root was found. */ ri->ri_cno = cno++; ri->ri_super_root = pseg_end; ri->ri_lsegs_start = ri->ri_lsegs_end = 0; nilfs_dispose_segment_list(&segments); sr_pseg_start = pseg_start; nilfs->ns_pseg_offset = pseg_start + nblocks - seg_start; nilfs->ns_seg_seq = seg_seq; nilfs->ns_segnum = segnum; nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */ nilfs->ns_ctime = le64_to_cpu(sum->ss_create); nilfs->ns_nextnum = nextnum; if (scan_newer) ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED; else { if (nilfs->ns_mount_state & NILFS_VALID_FS) goto super_root_found; scan_newer = 1; } try_next_pseg: /* Standing on a course, or met an inconsistent state */ pseg_start += nblocks; if (pseg_start < seg_end) continue; goto feed_segment; strayed: /* Off the trail */ if (!scan_newer) /* * This can happen if a checkpoint was written without * barriers, or as a result of an I/O failure. */ goto failed; feed_segment: /* Looking to the next full segment */ if (empty_seg++) goto super_root_found; /* found a valid super root */ ret = nilfs_segment_list_add(&segments, segnum); if (unlikely(ret)) goto failed; seg_seq++; segnum = nextnum; nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); pseg_start = seg_start; } super_root_found: /* Updating pointers relating to the latest checkpoint */ brelse(bh_sum); list_splice_tail(&segments, &ri->ri_used_segments); nilfs->ns_last_pseg = sr_pseg_start; nilfs->ns_last_seq = nilfs->ns_seg_seq; nilfs->ns_last_cno = ri->ri_cno; return 0; failed: brelse(bh_sum); nilfs_dispose_segment_list(&segments); return ret < 0 ? ret : nilfs_warn_segment_error(nilfs->ns_sb, ret); }
1 9 9 9 8 1 1 1 5 5 3 3 3 2 61 61 60 60 3 2 1 3 3 33 33 109 3 3 1 2 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 // SPDX-License-Identifier: GPL-2.0-only /* * KVM Microsoft Hyper-V emulation * * derived from arch/x86/kvm/x86.c * * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2008 Qumranet, Inc. * Copyright IBM Corporation, 2008 * Copyright 2010 Red Hat, Inc. and/or its affiliates. * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com> * * Authors: * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> * Amit Shah <amit.shah@qumranet.com> * Ben-Ami Yassour <benami@il.ibm.com> * Andrey Smetanin <asmetanin@virtuozzo.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "x86.h" #include "lapic.h" #include "ioapic.h" #include "cpuid.h" #include "hyperv.h" #include "mmu.h" #include "xen.h" #include <linux/cpu.h> #include <linux/kvm_host.h> #include <linux/highmem.h> #include <linux/sched/cputime.h> #include <linux/spinlock.h> #include <linux/eventfd.h> #include <asm/apicdef.h> #include <asm/mshyperv.h> #include <trace/events/kvm.h> #include "trace.h" #include "irq.h" #include "fpu.h" #define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, HV_VCPUS_PER_SPARSE_BANK) /* * As per Hyper-V TLFS, extended hypercalls start from 0x8001 * (HvExtCallQueryCapabilities). Response of this hypercalls is a 64 bit value * where each bit tells which extended hypercall is available besides * HvExtCallQueryCapabilities. * * 0x8001 - First extended hypercall, HvExtCallQueryCapabilities, no bit * assigned. * * 0x8002 - Bit 0 * 0x8003 - Bit 1 * .. * 0x8041 - Bit 63 * * Therefore, HV_EXT_CALL_MAX = 0x8001 + 64 */ #define HV_EXT_CALL_MAX (HV_EXT_CALL_QUERY_CAPABILITIES + 64) static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, bool vcpu_kick); static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint) { return atomic64_read(&synic->sint[sint]); } static inline int synic_get_sint_vector(u64 sint_value) { if (sint_value & HV_SYNIC_SINT_MASKED) return -1; return sint_value & HV_SYNIC_SINT_VECTOR_MASK; } static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic, int vector) { int i; for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector) return true; } return false; } static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic, int vector) { int i; u64 sint_value; for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { sint_value = synic_read_sint(synic, i); if (synic_get_sint_vector(sint_value) == vector && sint_value & HV_SYNIC_SINT_AUTO_EOI) return true; } return false; } static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, int vector) { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); bool auto_eoi_old, auto_eoi_new; if (vector < HV_SYNIC_FIRST_VALID_VECTOR) return; if (synic_has_vector_connected(synic, vector)) __set_bit(vector, synic->vec_bitmap); else __clear_bit(vector, synic->vec_bitmap); auto_eoi_old = !bitmap_empty(synic->auto_eoi_bitmap, 256); if (synic_has_vector_auto_eoi(synic, vector)) __set_bit(vector, synic->auto_eoi_bitmap); else __clear_bit(vector, synic->auto_eoi_bitmap); auto_eoi_new = !bitmap_empty(synic->auto_eoi_bitmap, 256); if (auto_eoi_old == auto_eoi_new) return; if (!enable_apicv) return; down_write(&vcpu->kvm->arch.apicv_update_lock); if (auto_eoi_new) hv->synic_auto_eoi_used++; else hv->synic_auto_eoi_used--; /* * Inhibit APICv if any vCPU is using SynIC's AutoEOI, which relies on * the hypervisor to manually inject IRQs. */ __kvm_set_or_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_HYPERV, !!hv->synic_auto_eoi_used); up_write(&vcpu->kvm->arch.apicv_update_lock); } static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, u64 data, bool host) { int vector, old_vector; bool masked; vector = data & HV_SYNIC_SINT_VECTOR_MASK; masked = data & HV_SYNIC_SINT_MASKED; /* * Valid vectors are 16-255, however, nested Hyper-V attempts to write * default '0x10000' value on boot and this should not #GP. We need to * allow zero-initing the register from host as well. */ if (vector < HV_SYNIC_FIRST_VALID_VECTOR && !host && !masked) return 1; /* * Guest may configure multiple SINTs to use the same vector, so * we maintain a bitmap of vectors handled by synic, and a * bitmap of vectors with auto-eoi behavior. The bitmaps are * updated here, and atomically queried on fast paths. */ old_vector = synic_read_sint(synic, sint) & HV_SYNIC_SINT_VECTOR_MASK; atomic64_set(&synic->sint[sint], data); synic_update_vector(synic, old_vector); synic_update_vector(synic, vector); /* Load SynIC vectors into EOI exit bitmap */ kvm_make_request(KVM_REQ_SCAN_IOAPIC, hv_synic_to_vcpu(synic)); return 0; } static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx) { struct kvm_vcpu *vcpu = NULL; unsigned long i; if (vpidx >= KVM_MAX_VCPUS) return NULL; vcpu = kvm_get_vcpu(kvm, vpidx); if (vcpu && kvm_hv_get_vpindex(vcpu) == vpidx) return vcpu; kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_hv_get_vpindex(vcpu) == vpidx) return vcpu; return NULL; } static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx) { struct kvm_vcpu *vcpu; struct kvm_vcpu_hv_synic *synic; vcpu = get_vcpu_by_vpidx(kvm, vpidx); if (!vcpu || !to_hv_vcpu(vcpu)) return NULL; synic = to_hv_synic(vcpu); return (synic->active) ? synic : NULL; } static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint) { struct kvm *kvm = vcpu->kvm; struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_stimer *stimer; int gsi, idx; trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint); /* Try to deliver pending Hyper-V SynIC timers messages */ for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) { stimer = &hv_vcpu->stimer[idx]; if (stimer->msg_pending && stimer->config.enable && !stimer->config.direct_mode && stimer->config.sintx == sint) stimer_mark_pending(stimer, false); } idx = srcu_read_lock(&kvm->irq_srcu); gsi = atomic_read(&synic->sint_to_gsi[sint]); if (gsi != -1) kvm_notify_acked_gsi(kvm, gsi); srcu_read_unlock(&kvm->irq_srcu, idx); } static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr) { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC; hv_vcpu->exit.u.synic.msr = msr; hv_vcpu->exit.u.synic.control = synic->control; hv_vcpu->exit.u.synic.evt_page = synic->evt_page; hv_vcpu->exit.u.synic.msg_page = synic->msg_page; kvm_make_request(KVM_REQ_HV_EXIT, vcpu); } static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 data, bool host) { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); int ret; if (!synic->active && (!host || data)) return 1; trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host); ret = 0; switch (msr) { case HV_X64_MSR_SCONTROL: synic->control = data; if (!host) synic_exit(synic, msr); break; case HV_X64_MSR_SVERSION: if (!host) { ret = 1; break; } synic->version = data; break; case HV_X64_MSR_SIEFP: if ((data & HV_SYNIC_SIEFP_ENABLE) && !host && !synic->dont_zero_synic_pages) if (kvm_clear_guest(vcpu->kvm, data & PAGE_MASK, PAGE_SIZE)) { ret = 1; break; } synic->evt_page = data; if (!host) synic_exit(synic, msr); break; case HV_X64_MSR_SIMP: if ((data & HV_SYNIC_SIMP_ENABLE) && !host && !synic->dont_zero_synic_pages) if (kvm_clear_guest(vcpu->kvm, data & PAGE_MASK, PAGE_SIZE)) { ret = 1; break; } synic->msg_page = data; if (!host) synic_exit(synic, msr); break; case HV_X64_MSR_EOM: { int i; if (!synic->active) break; for (i = 0; i < ARRAY_SIZE(synic->sint); i++) kvm_hv_notify_acked_sint(vcpu, i); break; } case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: ret = synic_set_sint(synic, msr - HV_X64_MSR_SINT0, data, host); break; default: ret = 1; break; } return ret; } static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); return hv_vcpu->cpuid_cache.syndbg_cap_eax & HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; } static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu) { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); if (vcpu->run->hyperv.u.syndbg.msr == HV_X64_MSR_SYNDBG_CONTROL) hv->hv_syndbg.control.status = vcpu->run->hyperv.u.syndbg.status; return 1; } static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNDBG; hv_vcpu->exit.u.syndbg.msr = msr; hv_vcpu->exit.u.syndbg.control = syndbg->control.control; hv_vcpu->exit.u.syndbg.send_page = syndbg->control.send_page; hv_vcpu->exit.u.syndbg.recv_page = syndbg->control.recv_page; hv_vcpu->exit.u.syndbg.pending_page = syndbg->control.pending_page; vcpu->arch.complete_userspace_io = kvm_hv_syndbg_complete_userspace; kvm_make_request(KVM_REQ_HV_EXIT, vcpu); } static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) return 1; trace_kvm_hv_syndbg_set_msr(vcpu->vcpu_id, to_hv_vcpu(vcpu)->vp_index, msr, data); switch (msr) { case HV_X64_MSR_SYNDBG_CONTROL: syndbg->control.control = data; if (!host) syndbg_exit(vcpu, msr); break; case HV_X64_MSR_SYNDBG_STATUS: syndbg->control.status = data; break; case HV_X64_MSR_SYNDBG_SEND_BUFFER: syndbg->control.send_page = data; break; case HV_X64_MSR_SYNDBG_RECV_BUFFER: syndbg->control.recv_page = data; break; case HV_X64_MSR_SYNDBG_PENDING_BUFFER: syndbg->control.pending_page = data; if (!host) syndbg_exit(vcpu, msr); break; case HV_X64_MSR_SYNDBG_OPTIONS: syndbg->options = data; break; default: break; } return 0; } static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) return 1; switch (msr) { case HV_X64_MSR_SYNDBG_CONTROL: *pdata = syndbg->control.control; break; case HV_X64_MSR_SYNDBG_STATUS: *pdata = syndbg->control.status; break; case HV_X64_MSR_SYNDBG_SEND_BUFFER: *pdata = syndbg->control.send_page; break; case HV_X64_MSR_SYNDBG_RECV_BUFFER: *pdata = syndbg->control.recv_page; break; case HV_X64_MSR_SYNDBG_PENDING_BUFFER: *pdata = syndbg->control.pending_page; break; case HV_X64_MSR_SYNDBG_OPTIONS: *pdata = syndbg->options; break; default: break; } trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, kvm_hv_get_vpindex(vcpu), msr, *pdata); return 0; } static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata, bool host) { int ret; if (!synic->active && !host) return 1; ret = 0; switch (msr) { case HV_X64_MSR_SCONTROL: *pdata = synic->control; break; case HV_X64_MSR_SVERSION: *pdata = synic->version; break; case HV_X64_MSR_SIEFP: *pdata = synic->evt_page; break; case HV_X64_MSR_SIMP: *pdata = synic->msg_page; break; case HV_X64_MSR_EOM: *pdata = 0; break; case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: *pdata = atomic64_read(&synic->sint[msr - HV_X64_MSR_SINT0]); break; default: ret = 1; break; } return ret; } static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); struct kvm_lapic_irq irq; int ret, vector; if (KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm)) return -EINVAL; if (sint >= ARRAY_SIZE(synic->sint)) return -EINVAL; vector = synic_get_sint_vector(synic_read_sint(synic, sint)); if (vector < 0) return -ENOENT; memset(&irq, 0, sizeof(irq)); irq.shorthand = APIC_DEST_SELF; irq.dest_mode = APIC_DEST_PHYSICAL; irq.delivery_mode = APIC_DM_FIXED; irq.vector = vector; irq.level = 1; ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL); trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret); return ret; } int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint) { struct kvm_vcpu_hv_synic *synic; synic = synic_get(kvm, vpidx); if (!synic) return -EINVAL; return synic_set_irq(synic, sint); } void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) { struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); int i; trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector); for (i = 0; i < ARRAY_SIZE(synic->sint); i++) if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector) kvm_hv_notify_acked_sint(vcpu, i); } static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vpidx, u32 sint, int gsi) { struct kvm_vcpu_hv_synic *synic; synic = synic_get(kvm, vpidx); if (!synic) return -EINVAL; if (sint >= ARRAY_SIZE(synic->sint_to_gsi)) return -EINVAL; atomic_set(&synic->sint_to_gsi[sint], gsi); return 0; } void kvm_hv_irq_routing_update(struct kvm *kvm) { struct kvm_irq_routing_table *irq_rt; struct kvm_kernel_irq_routing_entry *e; u32 gsi; irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu, lockdep_is_held(&kvm->irq_lock)); for (gsi = 0; gsi < irq_rt->nr_rt_entries; gsi++) { hlist_for_each_entry(e, &irq_rt->map[gsi], link) { if (e->type == KVM_IRQ_ROUTING_HV_SINT) kvm_hv_set_sint_gsi(kvm, e->hv_sint.vcpu, e->hv_sint.sint, gsi); } } } static void synic_init(struct kvm_vcpu_hv_synic *synic) { int i; memset(synic, 0, sizeof(*synic)); synic->version = HV_SYNIC_VERSION_1; for (i = 0; i < ARRAY_SIZE(synic->sint); i++) { atomic64_set(&synic->sint[i], HV_SYNIC_SINT_MASKED); atomic_set(&synic->sint_to_gsi[i], -1); } } static u64 get_time_ref_counter(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); struct kvm_vcpu *vcpu; u64 tsc; /* * Fall back to get_kvmclock_ns() when TSC page hasn't been set up, * is broken, disabled or being updated. */ if (hv->hv_tsc_page_status != HV_TSC_PAGE_SET) return div_u64(get_kvmclock_ns(kvm), 100); vcpu = kvm_get_vcpu(kvm, 0); tsc = kvm_read_l1_tsc(vcpu, rdtsc()); return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64) + hv->tsc_ref.tsc_offset; } static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, bool vcpu_kick) { struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); set_bit(stimer->index, to_hv_vcpu(vcpu)->stimer_pending_bitmap); kvm_make_request(KVM_REQ_HV_STIMER, vcpu); if (vcpu_kick) kvm_vcpu_kick(vcpu); } static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer) { struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); trace_kvm_hv_stimer_cleanup(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index); hrtimer_cancel(&stimer->timer); clear_bit(stimer->index, to_hv_vcpu(vcpu)->stimer_pending_bitmap); stimer->msg_pending = false; stimer->exp_time = 0; } static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer) { struct kvm_vcpu_hv_stimer *stimer; stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer); trace_kvm_hv_stimer_callback(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index); stimer_mark_pending(stimer, true); return HRTIMER_NORESTART; } /* * stimer_start() assumptions: * a) stimer->count is not equal to 0 * b) stimer->config has HV_STIMER_ENABLE flag */ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer) { u64 time_now; ktime_t ktime_now; time_now = get_time_ref_counter(hv_stimer_to_vcpu(stimer)->kvm); ktime_now = ktime_get(); if (stimer->config.periodic) { if (stimer->exp_time) { if (time_now >= stimer->exp_time) { u64 remainder; div64_u64_rem(time_now - stimer->exp_time, stimer->count, &remainder); stimer->exp_time = time_now + (stimer->count - remainder); } } else stimer->exp_time = time_now + stimer->count; trace_kvm_hv_stimer_start_periodic( hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, time_now, stimer->exp_time); hrtimer_start(&stimer->timer, ktime_add_ns(ktime_now, 100 * (stimer->exp_time - time_now)), HRTIMER_MODE_ABS); return 0; } stimer->exp_time = stimer->count; if (time_now >= stimer->count) { /* * Expire timer according to Hypervisor Top-Level Functional * specification v4(15.3.1): * "If a one shot is enabled and the specified count is in * the past, it will expire immediately." */ stimer_mark_pending(stimer, false); return 0; } trace_kvm_hv_stimer_start_one_shot(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, time_now, stimer->count); hrtimer_start(&stimer->timer, ktime_add_ns(ktime_now, 100 * (stimer->count - time_now)), HRTIMER_MODE_ABS); return 0; } static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config, bool host) { union hv_stimer_config new_config = {.as_uint64 = config}, old_config = {.as_uint64 = stimer->config.as_uint64}; struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); if (!synic->active && (!host || config)) return 1; if (unlikely(!host && hv_vcpu->enforce_cpuid && new_config.direct_mode && !(hv_vcpu->cpuid_cache.features_edx & HV_STIMER_DIRECT_MODE_AVAILABLE))) return 1; trace_kvm_hv_stimer_set_config(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, config, host); stimer_cleanup(stimer); if (old_config.enable && !new_config.direct_mode && new_config.sintx == 0) new_config.enable = 0; stimer->config.as_uint64 = new_config.as_uint64; if (stimer->config.enable) stimer_mark_pending(stimer, false); return 0; } static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count, bool host) { struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu); if (!synic->active && (!host || count)) return 1; trace_kvm_hv_stimer_set_count(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, count, host); stimer_cleanup(stimer); stimer->count = count; if (!host) { if (stimer->count == 0) stimer->config.enable = 0; else if (stimer->config.auto_enable) stimer->config.enable = 1; } if (stimer->config.enable) stimer_mark_pending(stimer, false); return 0; } static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig) { *pconfig = stimer->config.as_uint64; return 0; } static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount) { *pcount = stimer->count; return 0; } static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint, struct hv_message *src_msg, bool no_retry) { struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); int msg_off = offsetof(struct hv_message_page, sint_message[sint]); gfn_t msg_page_gfn; struct hv_message_header hv_hdr; int r; if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE)) return -ENOENT; msg_page_gfn = synic->msg_page >> PAGE_SHIFT; /* * Strictly following the spec-mandated ordering would assume setting * .msg_pending before checking .message_type. However, this function * is only called in vcpu context so the entire update is atomic from * guest POV and thus the exact order here doesn't matter. */ r = kvm_vcpu_read_guest_page(vcpu, msg_page_gfn, &hv_hdr.message_type, msg_off + offsetof(struct hv_message, header.message_type), sizeof(hv_hdr.message_type)); if (r < 0) return r; if (hv_hdr.message_type != HVMSG_NONE) { if (no_retry) return 0; hv_hdr.message_flags.msg_pending = 1; r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn, &hv_hdr.message_flags, msg_off + offsetof(struct hv_message, header.message_flags), sizeof(hv_hdr.message_flags)); if (r < 0) return r; return -EAGAIN; } r = kvm_vcpu_write_guest_page(vcpu, msg_page_gfn, src_msg, msg_off, sizeof(src_msg->header) + src_msg->header.payload_size); if (r < 0) return r; r = synic_set_irq(synic, sint); if (r < 0) return r; if (r == 0) return -EFAULT; return 0; } static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer) { struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct hv_message *msg = &stimer->msg; struct hv_timer_message_payload *payload = (struct hv_timer_message_payload *)&msg->u.payload; /* * To avoid piling up periodic ticks, don't retry message * delivery for them (within "lazy" lost ticks policy). */ bool no_retry = stimer->config.periodic; payload->expiration_time = stimer->exp_time; payload->delivery_time = get_time_ref_counter(vcpu->kvm); return synic_deliver_msg(to_hv_synic(vcpu), stimer->config.sintx, msg, no_retry); } static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer) { struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer); struct kvm_lapic_irq irq = { .delivery_mode = APIC_DM_FIXED, .vector = stimer->config.apic_vector }; if (lapic_in_kernel(vcpu)) return !kvm_apic_set_irq(vcpu, &irq, NULL); return 0; } static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer) { int r, direct = stimer->config.direct_mode; stimer->msg_pending = true; if (!direct) r = stimer_send_msg(stimer); else r = stimer_notify_direct(stimer); trace_kvm_hv_stimer_expiration(hv_stimer_to_vcpu(stimer)->vcpu_id, stimer->index, direct, r); if (!r) { stimer->msg_pending = false; if (!(stimer->config.periodic)) stimer->config.enable = 0; } } void kvm_hv_process_stimers(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_vcpu_hv_stimer *stimer; u64 time_now, exp_time; int i; if (!hv_vcpu) return; for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) { stimer = &hv_vcpu->stimer[i]; if (stimer->config.enable) { exp_time = stimer->exp_time; if (exp_time) { time_now = get_time_ref_counter(vcpu->kvm); if (time_now >= exp_time) stimer_expiration(stimer); } if ((stimer->config.enable) && stimer->count) { if (!stimer->msg_pending) stimer_start(stimer); } else stimer_cleanup(stimer); } } } void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); int i; if (!hv_vcpu) return; for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) stimer_cleanup(&hv_vcpu->stimer[i]); kfree(hv_vcpu); vcpu->arch.hyperv = NULL; } bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); if (!hv_vcpu) return false; if (!(hv_vcpu->hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) return false; return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; } EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled); int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); if (!hv_vcpu || !kvm_hv_assist_page_enabled(vcpu)) return -EFAULT; return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &hv_vcpu->vp_assist_page, sizeof(struct hv_vp_assist_page)); } EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page); static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer) { struct hv_message *msg = &stimer->msg; struct hv_timer_message_payload *payload = (struct hv_timer_message_payload *)&msg->u.payload; memset(&msg->header, 0, sizeof(msg->header)); msg->header.message_type = HVMSG_TIMER_EXPIRED; msg->header.payload_size = sizeof(*payload); payload->timer_index = stimer->index; payload->expiration_time = 0; payload->delivery_time = 0; } static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index) { memset(stimer, 0, sizeof(*stimer)); stimer->index = timer_index; hrtimer_init(&stimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); stimer->timer.function = stimer_timer_callback; stimer_prepare_msg(stimer); } int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); int i; if (hv_vcpu) return 0; hv_vcpu = kzalloc(sizeof(struct kvm_vcpu_hv), GFP_KERNEL_ACCOUNT); if (!hv_vcpu) return -ENOMEM; vcpu->arch.hyperv = hv_vcpu; hv_vcpu->vcpu = vcpu; synic_init(&hv_vcpu->synic); bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++) stimer_init(&hv_vcpu->stimer[i], i); hv_vcpu->vp_index = vcpu->vcpu_idx; for (i = 0; i < HV_NR_TLB_FLUSH_FIFOS; i++) { INIT_KFIFO(hv_vcpu->tlb_flush_fifo[i].entries); spin_lock_init(&hv_vcpu->tlb_flush_fifo[i].write_lock); } return 0; } int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) { struct kvm_vcpu_hv_synic *synic; int r; r = kvm_hv_vcpu_init(vcpu); if (r) return r; synic = to_hv_synic(vcpu); synic->active = true; synic->dont_zero_synic_pages = dont_zero_synic_pages; synic->control = HV_SYNIC_CONTROL_ENABLE; return 0; } static bool kvm_hv_msr_partition_wide(u32 msr) { bool r = false; switch (msr) { case HV_X64_MSR_GUEST_OS_ID: case HV_X64_MSR_HYPERCALL: case HV_X64_MSR_REFERENCE_TSC: case HV_X64_MSR_TIME_REF_COUNT: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_RESET: case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: case HV_X64_MSR_TSC_INVARIANT_CONTROL: case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: r = true; break; } return r; } static int kvm_hv_msr_get_crash_data(struct kvm *kvm, u32 index, u64 *pdata) { struct kvm_hv *hv = to_kvm_hv(kvm); size_t size = ARRAY_SIZE(hv->hv_crash_param); if (WARN_ON_ONCE(index >= size)) return -EINVAL; *pdata = hv->hv_crash_param[array_index_nospec(index, size)]; return 0; } static int kvm_hv_msr_get_crash_ctl(struct kvm *kvm, u64 *pdata) { struct kvm_hv *hv = to_kvm_hv(kvm); *pdata = hv->hv_crash_ctl; return 0; } static int kvm_hv_msr_set_crash_ctl(struct kvm *kvm, u64 data) { struct kvm_hv *hv = to_kvm_hv(kvm); hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY; return 0; } static int kvm_hv_msr_set_crash_data(struct kvm *kvm, u32 index, u64 data) { struct kvm_hv *hv = to_kvm_hv(kvm); size_t size = ARRAY_SIZE(hv->hv_crash_param); if (WARN_ON_ONCE(index >= size)) return -EINVAL; hv->hv_crash_param[array_index_nospec(index, size)] = data; return 0; } /* * The kvmclock and Hyper-V TSC page use similar formulas, and converting * between them is possible: * * kvmclock formula: * nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32) * + system_time * * Hyper-V formula: * nsec/100 = ticks * scale / 2^64 + offset * * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula. * By dividing the kvmclock formula by 100 and equating what's left we get: * ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100 * scale / 2^64 = tsc_to_system_mul * 2^(tsc_shift-32) / 100 * scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100 * * Now expand the kvmclock formula and divide by 100: * nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32) * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) * + system_time * nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100 * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100 * + system_time / 100 * * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64: * nsec/100 = ticks * scale / 2^64 * - tsc_timestamp * scale / 2^64 * + system_time / 100 * * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out: * offset = system_time / 100 - tsc_timestamp * scale / 2^64 * * These two equivalencies are implemented in this function. */ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, struct ms_hyperv_tsc_page *tsc_ref) { u64 max_mul; if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT)) return false; /* * check if scale would overflow, if so we use the time ref counter * tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64 * tsc_to_system_mul / 100 >= 2^(32-tsc_shift) * tsc_to_system_mul >= 100 * 2^(32-tsc_shift) */ max_mul = 100ull << (32 - hv_clock->tsc_shift); if (hv_clock->tsc_to_system_mul >= max_mul) return false; /* * Otherwise compute the scale and offset according to the formulas * derived above. */ tsc_ref->tsc_scale = mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift), hv_clock->tsc_to_system_mul, 100); tsc_ref->tsc_offset = hv_clock->system_time; do_div(tsc_ref->tsc_offset, 100); tsc_ref->tsc_offset -= mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64); return true; } /* * Don't touch TSC page values if the guest has opted for TSC emulation after * migration. KVM doesn't fully support reenlightenment notifications and TSC * access emulation and Hyper-V is known to expect the values in TSC page to * stay constant before TSC access emulation is disabled from guest side * (HV_X64_MSR_TSC_EMULATION_STATUS). KVM userspace is expected to preserve TSC * frequency and guest visible TSC value across migration (and prevent it when * TSC scaling is unsupported). */ static inline bool tsc_page_update_unsafe(struct kvm_hv *hv) { return (hv->hv_tsc_page_status != HV_TSC_PAGE_GUEST_CHANGED) && hv->hv_tsc_emulation_control; } void kvm_hv_setup_tsc_page(struct kvm *kvm, struct pvclock_vcpu_time_info *hv_clock) { struct kvm_hv *hv = to_kvm_hv(kvm); u32 tsc_seq; u64 gfn; BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0); mutex_lock(&hv->hv_lock); if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN || hv->hv_tsc_page_status == HV_TSC_PAGE_SET || hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET) goto out_unlock; if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) goto out_unlock; gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; /* * Because the TSC parameters only vary when there is a * change in the master clock, do not bother with caching. */ if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn), &tsc_seq, sizeof(tsc_seq)))) goto out_err; if (tsc_seq && tsc_page_update_unsafe(hv)) { if (kvm_read_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) goto out_err; hv->hv_tsc_page_status = HV_TSC_PAGE_SET; goto out_unlock; } /* * While we're computing and writing the parameters, force the * guest to use the time reference count MSR. */ hv->tsc_ref.tsc_sequence = 0; if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) goto out_err; if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref)) goto out_err; /* Ensure sequence is zero before writing the rest of the struct. */ smp_wmb(); if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) goto out_err; /* * Now switch to the TSC page mechanism by writing the sequence. */ tsc_seq++; if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0) tsc_seq = 1; /* Write the struct entirely before the non-zero sequence. */ smp_wmb(); hv->tsc_ref.tsc_sequence = tsc_seq; if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) goto out_err; hv->hv_tsc_page_status = HV_TSC_PAGE_SET; goto out_unlock; out_err: hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN; out_unlock: mutex_unlock(&hv->hv_lock); } void kvm_hv_request_tsc_page_update(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); mutex_lock(&hv->hv_lock); if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET && !tsc_page_update_unsafe(hv)) hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED; mutex_unlock(&hv->hv_lock); } static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr) { if (!hv_vcpu->enforce_cpuid) return true; switch (msr) { case HV_X64_MSR_GUEST_OS_ID: case HV_X64_MSR_HYPERCALL: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_HYPERCALL_AVAILABLE; case HV_X64_MSR_VP_RUNTIME: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_VP_RUNTIME_AVAILABLE; case HV_X64_MSR_TIME_REF_COUNT: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_TIME_REF_COUNT_AVAILABLE; case HV_X64_MSR_VP_INDEX: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_VP_INDEX_AVAILABLE; case HV_X64_MSR_RESET: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_RESET_AVAILABLE; case HV_X64_MSR_REFERENCE_TSC: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_REFERENCE_TSC_AVAILABLE; case HV_X64_MSR_SCONTROL: case HV_X64_MSR_SVERSION: case HV_X64_MSR_SIEFP: case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_SYNIC_AVAILABLE; case HV_X64_MSR_STIMER0_CONFIG: case HV_X64_MSR_STIMER1_CONFIG: case HV_X64_MSR_STIMER2_CONFIG: case HV_X64_MSR_STIMER3_CONFIG: case HV_X64_MSR_STIMER0_COUNT: case HV_X64_MSR_STIMER1_COUNT: case HV_X64_MSR_STIMER2_COUNT: case HV_X64_MSR_STIMER3_COUNT: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_SYNTIMER_AVAILABLE; case HV_X64_MSR_EOI: case HV_X64_MSR_ICR: case HV_X64_MSR_TPR: case HV_X64_MSR_VP_ASSIST_PAGE: return hv_vcpu->cpuid_cache.features_eax & HV_MSR_APIC_ACCESS_AVAILABLE; case HV_X64_MSR_TSC_FREQUENCY: case HV_X64_MSR_APIC_FREQUENCY: return hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_FREQUENCY_MSRS; case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: return hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_REENLIGHTENMENT; case HV_X64_MSR_TSC_INVARIANT_CONTROL: return hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_TSC_INVARIANT; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: return hv_vcpu->cpuid_cache.features_edx & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return hv_vcpu->cpuid_cache.features_edx & HV_FEATURE_DEBUG_MSRS_AVAILABLE; default: break; } return false; } static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm *kvm = vcpu->kvm; struct kvm_hv *hv = to_kvm_hv(kvm); if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr))) return 1; switch (msr) { case HV_X64_MSR_GUEST_OS_ID: hv->hv_guest_os_id = data; /* setting guest os id to zero disables hypercall page */ if (!hv->hv_guest_os_id) hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; break; case HV_X64_MSR_HYPERCALL: { u8 instructions[9]; int i = 0; u64 addr; /* if guest os id is not set hypercall should remain disabled */ if (!hv->hv_guest_os_id) break; if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { hv->hv_hypercall = data; break; } /* * If Xen and Hyper-V hypercalls are both enabled, disambiguate * the same way Xen itself does, by setting the bit 31 of EAX * which is RsvdZ in the 32-bit Hyper-V hypercall ABI and just * going to be clobbered on 64-bit. */ if (kvm_xen_hypercall_enabled(kvm)) { /* orl $0x80000000, %eax */ instructions[i++] = 0x0d; instructions[i++] = 0x00; instructions[i++] = 0x00; instructions[i++] = 0x00; instructions[i++] = 0x80; } /* vmcall/vmmcall */ static_call(kvm_x86_patch_hypercall)(vcpu, instructions + i); i += 3; /* ret */ ((unsigned char *)instructions)[i++] = 0xc3; addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK; if (kvm_vcpu_write_guest(vcpu, addr, instructions, i)) return 1; hv->hv_hypercall = data; break; } case HV_X64_MSR_REFERENCE_TSC: hv->hv_tsc_page = data; if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) { if (!host) hv->hv_tsc_page_status = HV_TSC_PAGE_GUEST_CHANGED; else hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED; kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); } else { hv->hv_tsc_page_status = HV_TSC_PAGE_UNSET; } break; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: return kvm_hv_msr_set_crash_data(kvm, msr - HV_X64_MSR_CRASH_P0, data); case HV_X64_MSR_CRASH_CTL: if (host) return kvm_hv_msr_set_crash_ctl(kvm, data); if (data & HV_CRASH_CTL_CRASH_NOTIFY) { vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n", hv->hv_crash_param[0], hv->hv_crash_param[1], hv->hv_crash_param[2], hv->hv_crash_param[3], hv->hv_crash_param[4]); /* Send notification about crash to user space */ kvm_make_request(KVM_REQ_HV_CRASH, vcpu); } break; case HV_X64_MSR_RESET: if (data == 1) { vcpu_debug(vcpu, "hyper-v reset requested\n"); kvm_make_request(KVM_REQ_HV_RESET, vcpu); } break; case HV_X64_MSR_REENLIGHTENMENT_CONTROL: hv->hv_reenlightenment_control = data; break; case HV_X64_MSR_TSC_EMULATION_CONTROL: hv->hv_tsc_emulation_control = data; break; case HV_X64_MSR_TSC_EMULATION_STATUS: if (data && !host) return 1; hv->hv_tsc_emulation_status = data; break; case HV_X64_MSR_TIME_REF_COUNT: /* read-only, but still ignore it if host-initiated */ if (!host) return 1; break; case HV_X64_MSR_TSC_INVARIANT_CONTROL: /* Only bit 0 is supported */ if (data & ~HV_EXPOSE_INVARIANT_TSC) return 1; /* The feature can't be disabled from the guest */ if (!host && hv->hv_invtsc_control && !data) return 1; hv->hv_invtsc_control = data; break; case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_set_msr(vcpu, msr, data, host); default: kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } return 0; } /* Calculate cpu time spent by current task in 100ns units */ static u64 current_task_runtime_100ns(void) { u64 utime, stime; task_cputime_adjusted(current, &utime, &stime); return div_u64(utime + stime, 100); } static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr))) return 1; switch (msr) { case HV_X64_MSR_VP_INDEX: { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); u32 new_vp_index = (u32)data; if (!host || new_vp_index >= KVM_MAX_VCPUS) return 1; if (new_vp_index == hv_vcpu->vp_index) return 0; /* * The VP index is initialized to vcpu_index by * kvm_hv_vcpu_postcreate so they initially match. Now the * VP index is changing, adjust num_mismatched_vp_indexes if * it now matches or no longer matches vcpu_idx. */ if (hv_vcpu->vp_index == vcpu->vcpu_idx) atomic_inc(&hv->num_mismatched_vp_indexes); else if (new_vp_index == vcpu->vcpu_idx) atomic_dec(&hv->num_mismatched_vp_indexes); hv_vcpu->vp_index = new_vp_index; break; } case HV_X64_MSR_VP_ASSIST_PAGE: { u64 gfn; unsigned long addr; if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) { hv_vcpu->hv_vapic = data; if (kvm_lapic_set_pv_eoi(vcpu, 0, 0)) return 1; break; } gfn = data >> HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT; addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); if (kvm_is_error_hva(addr)) return 1; /* * Clear apic_assist portion of struct hv_vp_assist_page * only, there can be valuable data in the rest which needs * to be preserved e.g. on migration. */ if (__put_user(0, (u32 __user *)addr)) return 1; hv_vcpu->hv_vapic = data; kvm_vcpu_mark_page_dirty(vcpu, gfn); if (kvm_lapic_set_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED, sizeof(struct hv_vp_assist_page))) return 1; break; } case HV_X64_MSR_EOI: return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); case HV_X64_MSR_ICR: return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); case HV_X64_MSR_VP_RUNTIME: if (!host) return 1; hv_vcpu->runtime_offset = data - current_task_runtime_100ns(); break; case HV_X64_MSR_SCONTROL: case HV_X64_MSR_SVERSION: case HV_X64_MSR_SIEFP: case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: return synic_set_msr(to_hv_synic(vcpu), msr, data, host); case HV_X64_MSR_STIMER0_CONFIG: case HV_X64_MSR_STIMER1_CONFIG: case HV_X64_MSR_STIMER2_CONFIG: case HV_X64_MSR_STIMER3_CONFIG: { int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2; return stimer_set_config(to_hv_stimer(vcpu, timer_index), data, host); } case HV_X64_MSR_STIMER0_COUNT: case HV_X64_MSR_STIMER1_COUNT: case HV_X64_MSR_STIMER2_COUNT: case HV_X64_MSR_STIMER3_COUNT: { int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2; return stimer_set_count(to_hv_stimer(vcpu, timer_index), data, host); } case HV_X64_MSR_TSC_FREQUENCY: case HV_X64_MSR_APIC_FREQUENCY: /* read-only, but still ignore it if host-initiated */ if (!host) return 1; break; default: kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } return 0; } static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data = 0; struct kvm *kvm = vcpu->kvm; struct kvm_hv *hv = to_kvm_hv(kvm); if (unlikely(!host && !hv_check_msr_access(to_hv_vcpu(vcpu), msr))) return 1; switch (msr) { case HV_X64_MSR_GUEST_OS_ID: data = hv->hv_guest_os_id; break; case HV_X64_MSR_HYPERCALL: data = hv->hv_hypercall; break; case HV_X64_MSR_TIME_REF_COUNT: data = get_time_ref_counter(kvm); break; case HV_X64_MSR_REFERENCE_TSC: data = hv->hv_tsc_page; break; case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: return kvm_hv_msr_get_crash_data(kvm, msr - HV_X64_MSR_CRASH_P0, pdata); case HV_X64_MSR_CRASH_CTL: return kvm_hv_msr_get_crash_ctl(kvm, pdata); case HV_X64_MSR_RESET: data = 0; break; case HV_X64_MSR_REENLIGHTENMENT_CONTROL: data = hv->hv_reenlightenment_control; break; case HV_X64_MSR_TSC_EMULATION_CONTROL: data = hv->hv_tsc_emulation_control; break; case HV_X64_MSR_TSC_EMULATION_STATUS: data = hv->hv_tsc_emulation_status; break; case HV_X64_MSR_TSC_INVARIANT_CONTROL: data = hv->hv_invtsc_control; break; case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: return syndbg_get_msr(vcpu, msr, pdata, host); default: kvm_pr_unimpl_rdmsr(vcpu, msr); return 1; } *pdata = data; return 0; } static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data = 0; struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); if (unlikely(!host && !hv_check_msr_access(hv_vcpu, msr))) return 1; switch (msr) { case HV_X64_MSR_VP_INDEX: data = hv_vcpu->vp_index; break; case HV_X64_MSR_EOI: return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); case HV_X64_MSR_ICR: return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); case HV_X64_MSR_VP_ASSIST_PAGE: data = hv_vcpu->hv_vapic; break; case HV_X64_MSR_VP_RUNTIME: data = current_task_runtime_100ns() + hv_vcpu->runtime_offset; break; case HV_X64_MSR_SCONTROL: case HV_X64_MSR_SVERSION: case HV_X64_MSR_SIEFP: case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: return synic_get_msr(to_hv_synic(vcpu), msr, pdata, host); case HV_X64_MSR_STIMER0_CONFIG: case HV_X64_MSR_STIMER1_CONFIG: case HV_X64_MSR_STIMER2_CONFIG: case HV_X64_MSR_STIMER3_CONFIG: { int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2; return stimer_get_config(to_hv_stimer(vcpu, timer_index), pdata); } case HV_X64_MSR_STIMER0_COUNT: case HV_X64_MSR_STIMER1_COUNT: case HV_X64_MSR_STIMER2_COUNT: case HV_X64_MSR_STIMER3_COUNT: { int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2; return stimer_get_count(to_hv_stimer(vcpu, timer_index), pdata); } case HV_X64_MSR_TSC_FREQUENCY: data = (u64)vcpu->arch.virtual_tsc_khz * 1000; break; case HV_X64_MSR_APIC_FREQUENCY: data = APIC_BUS_FREQUENCY; break; default: kvm_pr_unimpl_rdmsr(vcpu, msr); return 1; } *pdata = data; return 0; } int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); if (!host && !vcpu->arch.hyperv_enabled) return 1; if (kvm_hv_vcpu_init(vcpu)) return 1; if (kvm_hv_msr_partition_wide(msr)) { int r; mutex_lock(&hv->hv_lock); r = kvm_hv_set_msr_pw(vcpu, msr, data, host); mutex_unlock(&hv->hv_lock); return r; } else return kvm_hv_set_msr(vcpu, msr, data, host); } int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); if (!host && !vcpu->arch.hyperv_enabled) return 1; if (kvm_hv_vcpu_init(vcpu)) return 1; if (kvm_hv_msr_partition_wide(msr)) { int r; mutex_lock(&hv->hv_lock); r = kvm_hv_get_msr_pw(vcpu, msr, pdata, host); mutex_unlock(&hv->hv_lock); return r; } else return kvm_hv_get_msr(vcpu, msr, pdata, host); } static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask, unsigned long *vcpu_mask) { struct kvm_hv *hv = to_kvm_hv(kvm); bool has_mismatch = atomic_read(&hv->num_mismatched_vp_indexes); u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; struct kvm_vcpu *vcpu; int bank, sbank = 0; unsigned long i; u64 *bitmap; BUILD_BUG_ON(sizeof(vp_bitmap) > sizeof(*vcpu_mask) * BITS_TO_LONGS(KVM_MAX_VCPUS)); /* * If vp_index == vcpu_idx for all vCPUs, fill vcpu_mask directly, else * fill a temporary buffer and manually test each vCPU's VP index. */ if (likely(!has_mismatch)) bitmap = (u64 *)vcpu_mask; else bitmap = vp_bitmap; /* * Each set of 64 VPs is packed into sparse_banks, with valid_bank_mask * having a '1' for each bank that exists in sparse_banks. Sets must * be in ascending order, i.e. bank0..bankN. */ memset(bitmap, 0, sizeof(vp_bitmap)); for_each_set_bit(bank, (unsigned long *)&valid_bank_mask, KVM_HV_MAX_SPARSE_VCPU_SET_BITS) bitmap[bank] = sparse_banks[sbank++]; if (likely(!has_mismatch)) return; bitmap_zero(vcpu_mask, KVM_MAX_VCPUS); kvm_for_each_vcpu(i, vcpu, kvm) { if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap)) __set_bit(i, vcpu_mask); } } static bool hv_is_vp_in_sparse_set(u32 vp_id, u64 valid_bank_mask, u64 sparse_banks[]) { int valid_bit_nr = vp_id / HV_VCPUS_PER_SPARSE_BANK; unsigned long sbank; if (!test_bit(valid_bit_nr, (unsigned long *)&valid_bank_mask)) return false; /* * The index into the sparse bank is the number of preceding bits in * the valid mask. Optimize for VMs with <64 vCPUs by skipping the * fancy math if there can't possibly be preceding bits. */ if (valid_bit_nr) sbank = hweight64(valid_bank_mask & GENMASK_ULL(valid_bit_nr - 1, 0)); else sbank = 0; return test_bit(vp_id % HV_VCPUS_PER_SPARSE_BANK, (unsigned long *)&sparse_banks[sbank]); } struct kvm_hv_hcall { /* Hypercall input data */ u64 param; u64 ingpa; u64 outgpa; u16 code; u16 var_cnt; u16 rep_cnt; u16 rep_idx; bool fast; bool rep; sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS]; /* * Current read offset when KVM reads hypercall input data gradually, * either offset in bytes from 'ingpa' for regular hypercalls or the * number of already consumed 'XMM halves' for 'fast' hypercalls. */ union { gpa_t data_offset; int consumed_xmm_halves; }; }; static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc, u16 orig_cnt, u16 cnt_cap, u64 *data) { /* * Preserve the original count when ignoring entries via a "cap", KVM * still needs to validate the guest input (though the non-XMM path * punts on the checks). */ u16 cnt = min(orig_cnt, cnt_cap); int i, j; if (hc->fast) { /* * Each XMM holds two sparse banks, but do not count halves that * have already been consumed for hypercall parameters. */ if (orig_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - hc->consumed_xmm_halves) return HV_STATUS_INVALID_HYPERCALL_INPUT; for (i = 0; i < cnt; i++) { j = i + hc->consumed_xmm_halves; if (j % 2) data[i] = sse128_hi(hc->xmm[j / 2]); else data[i] = sse128_lo(hc->xmm[j / 2]); } return 0; } return kvm_read_guest(kvm, hc->ingpa + hc->data_offset, data, cnt * sizeof(*data)); } static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 *sparse_banks) { if (hc->var_cnt > HV_MAX_SPARSE_VCPU_BANKS) return -EINVAL; /* Cap var_cnt to ignore banks that cannot contain a legal VP index. */ return kvm_hv_get_hc_data(kvm, hc, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS, sparse_banks); } static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[]) { return kvm_hv_get_hc_data(kvm, hc, hc->rep_cnt, hc->rep_cnt, entries); } static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu, struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo, u64 *entries, int count) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); u64 flush_all_entry = KVM_HV_TLB_FLUSHALL_ENTRY; if (!hv_vcpu) return; spin_lock(&tlb_flush_fifo->write_lock); /* * All entries should fit on the fifo leaving one free for 'flush all' * entry in case another request comes in. In case there's not enough * space, just put 'flush all' entry there. */ if (count && entries && count < kfifo_avail(&tlb_flush_fifo->entries)) { WARN_ON(kfifo_in(&tlb_flush_fifo->entries, entries, count) != count); goto out_unlock; } /* * Note: full fifo always contains 'flush all' entry, no need to check the * return value. */ kfifo_in(&tlb_flush_fifo->entries, &flush_all_entry, 1); out_unlock: spin_unlock(&tlb_flush_fifo->write_lock); } int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); u64 entries[KVM_HV_TLB_FLUSH_FIFO_SIZE]; int i, j, count; gva_t gva; if (!tdp_enabled || !hv_vcpu) return -EINVAL; tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu)); count = kfifo_out(&tlb_flush_fifo->entries, entries, KVM_HV_TLB_FLUSH_FIFO_SIZE); for (i = 0; i < count; i++) { if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY) goto out_flush_all; /* * Lower 12 bits of 'address' encode the number of additional * pages to flush. */ gva = entries[i] & PAGE_MASK; for (j = 0; j < (entries[i] & ~PAGE_MASK) + 1; j++) static_call(kvm_x86_flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE); ++vcpu->stat.tlb_flush; } return 0; out_flush_all: kfifo_reset_out(&tlb_flush_fifo->entries); /* Fall back to full flush. */ return -ENOSPC; } static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); u64 *sparse_banks = hv_vcpu->sparse_banks; struct kvm *kvm = vcpu->kvm; struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo; /* * Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE' * entries on the TLB flush fifo. The last entry, however, needs to be * always left free for 'flush all' entry which gets placed when * there is not enough space to put all the requested entries. */ u64 __tlb_flush_entries[KVM_HV_TLB_FLUSH_FIFO_SIZE - 1]; u64 *tlb_flush_entries; u64 valid_bank_mask; struct kvm_vcpu *v; unsigned long i; bool all_cpus; /* * The Hyper-V TLFS doesn't allow more than HV_MAX_SPARSE_VCPU_BANKS * sparse banks. Fail the build if KVM's max allowed number of * vCPUs (>4096) exceeds this limit. */ BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > HV_MAX_SPARSE_VCPU_BANKS); /* * 'Slow' hypercall's first parameter is the address in guest's memory * where hypercall parameters are placed. This is either a GPA or a * nested GPA when KVM is handling the call from L2 ('direct' TLB * flush). Translate the address here so the memory can be uniformly * read with kvm_read_guest(). */ if (!hc->fast && is_guest_mode(vcpu)) { hc->ingpa = translate_nested_gpa(vcpu, hc->ingpa, 0, NULL); if (unlikely(hc->ingpa == INVALID_GPA)) return HV_STATUS_INVALID_HYPERCALL_INPUT; } if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST || hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE) { if (hc->fast) { flush.address_space = hc->ingpa; flush.flags = hc->outgpa; flush.processor_mask = sse128_lo(hc->xmm[0]); hc->consumed_xmm_halves = 1; } else { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush, sizeof(flush)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; hc->data_offset = sizeof(flush); } trace_kvm_hv_flush_tlb(flush.processor_mask, flush.address_space, flush.flags, is_guest_mode(vcpu)); valid_bank_mask = BIT_ULL(0); sparse_banks[0] = flush.processor_mask; /* * Work around possible WS2012 bug: it sends hypercalls * with processor_mask = 0x0 and HV_FLUSH_ALL_PROCESSORS clear, * while also expecting us to flush something and crashing if * we don't. Let's treat processor_mask == 0 same as * HV_FLUSH_ALL_PROCESSORS. */ all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) || flush.processor_mask == 0; } else { if (hc->fast) { flush_ex.address_space = hc->ingpa; flush_ex.flags = hc->outgpa; memcpy(&flush_ex.hv_vp_set, &hc->xmm[0], sizeof(hc->xmm[0])); hc->consumed_xmm_halves = 2; } else { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex, sizeof(flush_ex)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; hc->data_offset = sizeof(flush_ex); } trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask, flush_ex.hv_vp_set.format, flush_ex.address_space, flush_ex.flags, is_guest_mode(vcpu)); valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask; all_cpus = flush_ex.hv_vp_set.format != HV_GENERIC_SET_SPARSE_4K; if (hc->var_cnt != hweight64(valid_bank_mask)) return HV_STATUS_INVALID_HYPERCALL_INPUT; if (!all_cpus) { if (!hc->var_cnt) goto ret_success; if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks)) return HV_STATUS_INVALID_HYPERCALL_INPUT; } /* * Hyper-V TLFS doesn't explicitly forbid non-empty sparse vCPU * banks (and, thus, non-zero 'var_cnt') for the 'all vCPUs' * case (HV_GENERIC_SET_ALL). Always adjust data_offset and * consumed_xmm_halves to make sure TLB flush entries are read * from the correct offset. */ if (hc->fast) hc->consumed_xmm_halves += hc->var_cnt; else hc->data_offset += hc->var_cnt * sizeof(sparse_banks[0]); } if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE || hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX || hc->rep_cnt > ARRAY_SIZE(__tlb_flush_entries)) { tlb_flush_entries = NULL; } else { if (kvm_hv_get_tlb_flush_entries(kvm, hc, __tlb_flush_entries)) return HV_STATUS_INVALID_HYPERCALL_INPUT; tlb_flush_entries = __tlb_flush_entries; } /* * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. */ if (all_cpus && !is_guest_mode(vcpu)) { kvm_for_each_vcpu(i, v, kvm) { tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false); hv_tlb_flush_enqueue(v, tlb_flush_fifo, tlb_flush_entries, hc->rep_cnt); } kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH); } else if (!is_guest_mode(vcpu)) { sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask); for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) { v = kvm_get_vcpu(kvm, i); if (!v) continue; tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false); hv_tlb_flush_enqueue(v, tlb_flush_fifo, tlb_flush_entries, hc->rep_cnt); } kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask); } else { struct kvm_vcpu_hv *hv_v; bitmap_zero(vcpu_mask, KVM_MAX_VCPUS); kvm_for_each_vcpu(i, v, kvm) { hv_v = to_hv_vcpu(v); /* * The following check races with nested vCPUs entering/exiting * and/or migrating between L1's vCPUs, however the only case when * KVM *must* flush the TLB is when the target L2 vCPU keeps * running on the same L1 vCPU from the moment of the request until * kvm_hv_flush_tlb() returns. TLB is fully flushed in all other * cases, e.g. when the target L2 vCPU migrates to a different L1 * vCPU or when the corresponding L1 vCPU temporary switches to a * different L2 vCPU while the request is being processed. */ if (!hv_v || hv_v->nested.vm_id != hv_vcpu->nested.vm_id) continue; if (!all_cpus && !hv_is_vp_in_sparse_set(hv_v->nested.vp_id, valid_bank_mask, sparse_banks)) continue; __set_bit(i, vcpu_mask); tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, true); hv_tlb_flush_enqueue(v, tlb_flush_fifo, tlb_flush_entries, hc->rep_cnt); } kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask); } ret_success: /* We always do full TLB flush, set 'Reps completed' = 'Rep Count' */ return (u64)HV_STATUS_SUCCESS | ((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET); } static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector, u64 *sparse_banks, u64 valid_bank_mask) { struct kvm_lapic_irq irq = { .delivery_mode = APIC_DM_FIXED, .vector = vector }; struct kvm_vcpu *vcpu; unsigned long i; kvm_for_each_vcpu(i, vcpu, kvm) { if (sparse_banks && !hv_is_vp_in_sparse_set(kvm_hv_get_vpindex(vcpu), valid_bank_mask, sparse_banks)) continue; /* We fail only when APIC is disabled */ kvm_apic_set_irq(vcpu, &irq, NULL); } } static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); u64 *sparse_banks = hv_vcpu->sparse_banks; struct kvm *kvm = vcpu->kvm; struct hv_send_ipi_ex send_ipi_ex; struct hv_send_ipi send_ipi; u64 valid_bank_mask; u32 vector; bool all_cpus; if (hc->code == HVCALL_SEND_IPI) { if (!hc->fast) { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi, sizeof(send_ipi)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; sparse_banks[0] = send_ipi.cpu_mask; vector = send_ipi.vector; } else { /* 'reserved' part of hv_send_ipi should be 0 */ if (unlikely(hc->ingpa >> 32 != 0)) return HV_STATUS_INVALID_HYPERCALL_INPUT; sparse_banks[0] = hc->outgpa; vector = (u32)hc->ingpa; } all_cpus = false; valid_bank_mask = BIT_ULL(0); trace_kvm_hv_send_ipi(vector, sparse_banks[0]); } else { if (!hc->fast) { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi_ex, sizeof(send_ipi_ex)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; } else { send_ipi_ex.vector = (u32)hc->ingpa; send_ipi_ex.vp_set.format = hc->outgpa; send_ipi_ex.vp_set.valid_bank_mask = sse128_lo(hc->xmm[0]); } trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector, send_ipi_ex.vp_set.format, send_ipi_ex.vp_set.valid_bank_mask); vector = send_ipi_ex.vector; valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask; all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL; if (hc->var_cnt != hweight64(valid_bank_mask)) return HV_STATUS_INVALID_HYPERCALL_INPUT; if (all_cpus) goto check_and_send_ipi; if (!hc->var_cnt) goto ret_success; if (!hc->fast) hc->data_offset = offsetof(struct hv_send_ipi_ex, vp_set.bank_contents); else hc->consumed_xmm_halves = 1; if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks)) return HV_STATUS_INVALID_HYPERCALL_INPUT; } check_and_send_ipi: if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) return HV_STATUS_INVALID_HYPERCALL_INPUT; if (all_cpus) kvm_hv_send_ipi_to_many(kvm, vector, NULL, 0); else kvm_hv_send_ipi_to_many(kvm, vector, sparse_banks, valid_bank_mask); ret_success: return HV_STATUS_SUCCESS; } void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu, bool hyperv_enabled) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_cpuid_entry2 *entry; vcpu->arch.hyperv_enabled = hyperv_enabled; if (!hv_vcpu) { /* * KVM should have already allocated kvm_vcpu_hv if Hyper-V is * enabled in CPUID. */ WARN_ON_ONCE(vcpu->arch.hyperv_enabled); return; } memset(&hv_vcpu->cpuid_cache, 0, sizeof(hv_vcpu->cpuid_cache)); if (!vcpu->arch.hyperv_enabled) return; entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES); if (entry) { hv_vcpu->cpuid_cache.features_eax = entry->eax; hv_vcpu->cpuid_cache.features_ebx = entry->ebx; hv_vcpu->cpuid_cache.features_edx = entry->edx; } entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO); if (entry) { hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax; hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx; } entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES); if (entry) hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax; entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_NESTED_FEATURES); if (entry) { hv_vcpu->cpuid_cache.nested_eax = entry->eax; hv_vcpu->cpuid_cache.nested_ebx = entry->ebx; } } int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce) { struct kvm_vcpu_hv *hv_vcpu; int ret = 0; if (!to_hv_vcpu(vcpu)) { if (enforce) { ret = kvm_hv_vcpu_init(vcpu); if (ret) return ret; } else { return 0; } } hv_vcpu = to_hv_vcpu(vcpu); hv_vcpu->enforce_cpuid = enforce; return ret; } static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) { bool longmode; longmode = is_64_bit_hypercall(vcpu); if (longmode) kvm_rax_write(vcpu, result); else { kvm_rdx_write(vcpu, result >> 32); kvm_rax_write(vcpu, result & 0xffffffff); } } static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result) { u32 tlb_lock_count = 0; int ret; if (hv_result_success(result) && is_guest_mode(vcpu) && kvm_hv_is_tlb_flush_hcall(vcpu) && kvm_read_guest(vcpu->kvm, to_hv_vcpu(vcpu)->nested.pa_page_gpa, &tlb_lock_count, sizeof(tlb_lock_count))) result = HV_STATUS_INVALID_HYPERCALL_INPUT; trace_kvm_hv_hypercall_done(result); kvm_hv_hypercall_set_result(vcpu, result); ++vcpu->stat.hypercalls; ret = kvm_skip_emulated_instruction(vcpu); if (tlb_lock_count) kvm_x86_ops.nested_ops->hv_inject_synthetic_vmexit_post_tlb_flush(vcpu); return ret; } static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) { return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result); } static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) { struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); struct eventfd_ctx *eventfd; if (unlikely(!hc->fast)) { int ret; gpa_t gpa = hc->ingpa; if ((gpa & (__alignof__(hc->ingpa) - 1)) || offset_in_page(gpa) + sizeof(hc->ingpa) > PAGE_SIZE) return HV_STATUS_INVALID_ALIGNMENT; ret = kvm_vcpu_read_guest(vcpu, gpa, &hc->ingpa, sizeof(hc->ingpa)); if (ret < 0) return HV_STATUS_INVALID_ALIGNMENT; } /* * Per spec, bits 32-47 contain the extra "flag number". However, we * have no use for it, and in all known usecases it is zero, so just * report lookup failure if it isn't. */ if (hc->ingpa & 0xffff00000000ULL) return HV_STATUS_INVALID_PORT_ID; /* remaining bits are reserved-zero */ if (hc->ingpa & ~KVM_HYPERV_CONN_ID_MASK) return HV_STATUS_INVALID_HYPERCALL_INPUT; /* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */ rcu_read_lock(); eventfd = idr_find(&hv->conn_to_evt, hc->ingpa); rcu_read_unlock(); if (!eventfd) return HV_STATUS_INVALID_PORT_ID; eventfd_signal(eventfd, 1); return HV_STATUS_SUCCESS; } static bool is_xmm_fast_hypercall(struct kvm_hv_hcall *hc) { switch (hc->code) { case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: case HVCALL_SEND_IPI_EX: return true; } return false; } static void kvm_hv_hypercall_read_xmm(struct kvm_hv_hcall *hc) { int reg; kvm_fpu_get(); for (reg = 0; reg < HV_HYPERCALL_MAX_XMM_REGISTERS; reg++) _kvm_read_sse_reg(reg, &hc->xmm[reg]); kvm_fpu_put(); } static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) { if (!hv_vcpu->enforce_cpuid) return true; switch (code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: return hv_vcpu->cpuid_cache.enlightenments_ebx && hv_vcpu->cpuid_cache.enlightenments_ebx != U32_MAX; case HVCALL_POST_MESSAGE: return hv_vcpu->cpuid_cache.features_ebx & HV_POST_MESSAGES; case HVCALL_SIGNAL_EVENT: return hv_vcpu->cpuid_cache.features_ebx & HV_SIGNAL_EVENTS; case HVCALL_POST_DEBUG_DATA: case HVCALL_RETRIEVE_DEBUG_DATA: case HVCALL_RESET_DEBUG_SESSION: /* * Return 'true' when SynDBG is disabled so the resulting code * will be HV_STATUS_INVALID_HYPERCALL_CODE. */ return !kvm_hv_is_syndbg_enabled(hv_vcpu->vcpu) || hv_vcpu->cpuid_cache.features_ebx & HV_DEBUGGING; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: if (!(hv_vcpu->cpuid_cache.enlightenments_eax & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) return false; fallthrough; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: return hv_vcpu->cpuid_cache.enlightenments_eax & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; case HVCALL_SEND_IPI_EX: if (!(hv_vcpu->cpuid_cache.enlightenments_eax & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) return false; fallthrough; case HVCALL_SEND_IPI: return hv_vcpu->cpuid_cache.enlightenments_eax & HV_X64_CLUSTER_IPI_RECOMMENDED; case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX: return hv_vcpu->cpuid_cache.features_ebx & HV_ENABLE_EXTENDED_HYPERCALLS; default: break; } return true; } int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_hv_hcall hc; u64 ret = HV_STATUS_SUCCESS; /* * hypercall generates UD from non zero cpl and real mode * per HYPER-V spec */ if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } #ifdef CONFIG_X86_64 if (is_64_bit_hypercall(vcpu)) { hc.param = kvm_rcx_read(vcpu); hc.ingpa = kvm_rdx_read(vcpu); hc.outgpa = kvm_r8_read(vcpu); } else #endif { hc.param = ((u64)kvm_rdx_read(vcpu) << 32) | (kvm_rax_read(vcpu) & 0xffffffff); hc.ingpa = ((u64)kvm_rbx_read(vcpu) << 32) | (kvm_rcx_read(vcpu) & 0xffffffff); hc.outgpa = ((u64)kvm_rdi_read(vcpu) << 32) | (kvm_rsi_read(vcpu) & 0xffffffff); } hc.code = hc.param & 0xffff; hc.var_cnt = (hc.param & HV_HYPERCALL_VARHEAD_MASK) >> HV_HYPERCALL_VARHEAD_OFFSET; hc.fast = !!(hc.param & HV_HYPERCALL_FAST_BIT); hc.rep_cnt = (hc.param >> HV_HYPERCALL_REP_COMP_OFFSET) & 0xfff; hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff; hc.rep = !!(hc.rep_cnt || hc.rep_idx); trace_kvm_hv_hypercall(hc.code, hc.fast, hc.var_cnt, hc.rep_cnt, hc.rep_idx, hc.ingpa, hc.outgpa); if (unlikely(!hv_check_hypercall_access(hv_vcpu, hc.code))) { ret = HV_STATUS_ACCESS_DENIED; goto hypercall_complete; } if (unlikely(hc.param & HV_HYPERCALL_RSVD_MASK)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; goto hypercall_complete; } if (hc.fast && is_xmm_fast_hypercall(&hc)) { if (unlikely(hv_vcpu->enforce_cpuid && !(hv_vcpu->cpuid_cache.features_edx & HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE))) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } kvm_hv_hypercall_read_xmm(&hc); } switch (hc.code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: if (unlikely(hc.rep || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } kvm_vcpu_on_spin(vcpu, true); break; case HVCALL_SIGNAL_EVENT: if (unlikely(hc.rep || hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hvcall_signal_event(vcpu, &hc); if (ret != HV_STATUS_INVALID_PORT_ID) break; fallthrough; /* maybe userspace knows this conn_id */ case HVCALL_POST_MESSAGE: /* don't bother userspace if it has no way to handle it */ if (unlikely(hc.rep || hc.var_cnt || !to_hv_synic(vcpu)->active)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } goto hypercall_userspace_exit; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST: if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } fallthrough; case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: if (unlikely(!hc.rep_cnt || hc.rep_idx)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hv_flush_tlb(vcpu, &hc); break; case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE: if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } fallthrough; case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hv_flush_tlb(vcpu, &hc); break; case HVCALL_SEND_IPI: if (unlikely(hc.var_cnt)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } fallthrough; case HVCALL_SEND_IPI_EX: if (unlikely(hc.rep)) { ret = HV_STATUS_INVALID_HYPERCALL_INPUT; break; } ret = kvm_hv_send_ipi(vcpu, &hc); break; case HVCALL_POST_DEBUG_DATA: case HVCALL_RETRIEVE_DEBUG_DATA: if (unlikely(hc.fast)) { ret = HV_STATUS_INVALID_PARAMETER; break; } fallthrough; case HVCALL_RESET_DEBUG_SESSION: { struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu); if (!kvm_hv_is_syndbg_enabled(vcpu)) { ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; } if (!(syndbg->options & HV_X64_SYNDBG_OPTION_USE_HCALLS)) { ret = HV_STATUS_OPERATION_DENIED; break; } goto hypercall_userspace_exit; } case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX: if (unlikely(hc.fast)) { ret = HV_STATUS_INVALID_PARAMETER; break; } goto hypercall_userspace_exit; default: ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; } hypercall_complete: return kvm_hv_hypercall_complete(vcpu, ret); hypercall_userspace_exit: vcpu->run->exit_reason = KVM_EXIT_HYPERV; vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; vcpu->run->hyperv.u.hcall.input = hc.param; vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa; vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa; vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace; return 0; } void kvm_hv_init_vm(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); mutex_init(&hv->hv_lock); idr_init(&hv->conn_to_evt); } void kvm_hv_destroy_vm(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; int i; idr_for_each_entry(&hv->conn_to_evt, eventfd, i) eventfd_ctx_put(eventfd); idr_destroy(&hv->conn_to_evt); } static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd) { struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; int ret; eventfd = eventfd_ctx_fdget(fd); if (IS_ERR(eventfd)) return PTR_ERR(eventfd); mutex_lock(&hv->hv_lock); ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1, GFP_KERNEL_ACCOUNT); mutex_unlock(&hv->hv_lock); if (ret >= 0) return 0; if (ret == -ENOSPC) ret = -EEXIST; eventfd_ctx_put(eventfd); return ret; } static int kvm_hv_eventfd_deassign(struct kvm *kvm, u32 conn_id) { struct kvm_hv *hv = to_kvm_hv(kvm); struct eventfd_ctx *eventfd; mutex_lock(&hv->hv_lock); eventfd = idr_remove(&hv->conn_to_evt, conn_id); mutex_unlock(&hv->hv_lock); if (!eventfd) return -ENOENT; synchronize_srcu(&kvm->srcu); eventfd_ctx_put(eventfd); return 0; } int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args) { if ((args->flags & ~KVM_HYPERV_EVENTFD_DEASSIGN) || (args->conn_id & ~KVM_HYPERV_CONN_ID_MASK)) return -EINVAL; if (args->flags == KVM_HYPERV_EVENTFD_DEASSIGN) return kvm_hv_eventfd_deassign(kvm, args->conn_id); return kvm_hv_eventfd_assign(kvm, args->conn_id, args->fd); } int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { uint16_t evmcs_ver = 0; struct kvm_cpuid_entry2 cpuid_entries[] = { { .function = HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS }, { .function = HYPERV_CPUID_INTERFACE }, { .function = HYPERV_CPUID_VERSION }, { .function = HYPERV_CPUID_FEATURES }, { .function = HYPERV_CPUID_ENLIGHTMENT_INFO }, { .function = HYPERV_CPUID_IMPLEMENT_LIMITS }, { .function = HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS }, { .function = HYPERV_CPUID_SYNDBG_INTERFACE }, { .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES }, { .function = HYPERV_CPUID_NESTED_FEATURES }, }; int i, nent = ARRAY_SIZE(cpuid_entries); if (kvm_x86_ops.nested_ops->get_evmcs_version) evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu); if (cpuid->nent < nent) return -E2BIG; if (cpuid->nent > nent) cpuid->nent = nent; for (i = 0; i < nent; i++) { struct kvm_cpuid_entry2 *ent = &cpuid_entries[i]; u32 signature[3]; switch (ent->function) { case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS: memcpy(signature, "Linux KVM Hv", 12); ent->eax = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES; ent->ebx = signature[0]; ent->ecx = signature[1]; ent->edx = signature[2]; break; case HYPERV_CPUID_INTERFACE: ent->eax = HYPERV_CPUID_SIGNATURE_EAX; break; case HYPERV_CPUID_VERSION: /* * We implement some Hyper-V 2016 functions so let's use * this version. */ ent->eax = 0x00003839; ent->ebx = 0x000A0000; break; case HYPERV_CPUID_FEATURES: ent->eax |= HV_MSR_VP_RUNTIME_AVAILABLE; ent->eax |= HV_MSR_TIME_REF_COUNT_AVAILABLE; ent->eax |= HV_MSR_SYNIC_AVAILABLE; ent->eax |= HV_MSR_SYNTIMER_AVAILABLE; ent->eax |= HV_MSR_APIC_ACCESS_AVAILABLE; ent->eax |= HV_MSR_HYPERCALL_AVAILABLE; ent->eax |= HV_MSR_VP_INDEX_AVAILABLE; ent->eax |= HV_MSR_RESET_AVAILABLE; ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE; ent->eax |= HV_ACCESS_FREQUENCY_MSRS; ent->eax |= HV_ACCESS_REENLIGHTENMENT; ent->eax |= HV_ACCESS_TSC_INVARIANT; ent->ebx |= HV_POST_MESSAGES; ent->ebx |= HV_SIGNAL_EVENTS; ent->ebx |= HV_ENABLE_EXTENDED_HYPERCALLS; ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE; ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; ent->ebx |= HV_DEBUGGING; ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE; ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_EXT_GVA_RANGES_FLUSH; /* * Direct Synthetic timers only make sense with in-kernel * LAPIC */ if (!vcpu || lapic_in_kernel(vcpu)) ent->edx |= HV_STIMER_DIRECT_MODE_AVAILABLE; break; case HYPERV_CPUID_ENLIGHTMENT_INFO: ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED; ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED; ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; if (evmcs_ver) ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; if (!cpu_smt_possible()) ent->eax |= HV_X64_NO_NONARCH_CORESHARING; ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED; /* * Default number of spinlock retry attempts, matches * HyperV 2016. */ ent->ebx = 0x00000FFF; break; case HYPERV_CPUID_IMPLEMENT_LIMITS: /* Maximum number of virtual processors */ ent->eax = KVM_MAX_VCPUS; /* * Maximum number of logical processors, matches * HyperV 2016. */ ent->ebx = 64; break; case HYPERV_CPUID_NESTED_FEATURES: ent->eax = evmcs_ver; ent->eax |= HV_X64_NESTED_DIRECT_FLUSH; ent->eax |= HV_X64_NESTED_MSR_BITMAP; ent->ebx |= HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL; break; case HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS: memcpy(signature, "Linux KVM Hv", 12); ent->eax = 0; ent->ebx = signature[0]; ent->ecx = signature[1]; ent->edx = signature[2]; break; case HYPERV_CPUID_SYNDBG_INTERFACE: memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12); ent->eax = signature[0]; break; case HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES: ent->eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; break; default: break; } } if (copy_to_user(entries, cpuid_entries, nent * sizeof(struct kvm_cpuid_entry2))) return -EFAULT; return 0; }
6 10 10 6 6 6 6 6 6 6 6 133 2 14 79 71 2 10 1 1 1 6 6 6 1 5 6 6 6 6 182 18 72 7 134 6 21 18 1 2 147 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 // SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> */ #include <linux/module.h> #include <linux/gfp.h> #include <net/tcp.h> static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); u32 elapsed, user_timeout; s32 remaining; user_timeout = READ_ONCE(icsk->icsk_user_timeout); if (!user_timeout) return icsk->icsk_rto; elapsed = tcp_time_stamp_ts(tp) - tp->retrans_stamp; if (tp->tcp_usec_ts) elapsed /= USEC_PER_MSEC; remaining = user_timeout - elapsed; if (remaining <= 0) return 1; /* user timeout has passed; fire ASAP */ return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(remaining)); } u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when) { struct inet_connection_sock *icsk = inet_csk(sk); u32 remaining, user_timeout; s32 elapsed; user_timeout = READ_ONCE(icsk->icsk_user_timeout); if (!user_timeout || !icsk->icsk_probes_tstamp) return when; elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp; if (unlikely(elapsed < 0)) elapsed = 0; remaining = msecs_to_jiffies(user_timeout) - elapsed; remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN); return min_t(u32, remaining, when); } /** * tcp_write_err() - close socket and save error info * @sk: The socket the error has appeared on. * * Returns: Nothing (void) */ static void tcp_write_err(struct sock *sk) { WRITE_ONCE(sk->sk_err, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT); sk_error_report(sk); tcp_write_queue_purge(sk); tcp_done(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); } /** * tcp_out_of_resources() - Close socket if out of resources * @sk: pointer to current socket * @do_reset: send a last packet with reset flag * * Do not allow orphaned sockets to eat all our resources. * This is direct violation of TCP specs, but it is required * to prevent DoS attacks. It is called when a retransmission timeout * or zero probe timeout occurs on orphaned socket. * * Also close if our net namespace is exiting; in that case there is no * hope of ever communicating again since all netns interfaces are already * down (or about to be down), and we need to release our dst references, * which have been moved to the netns loopback interface, so the namespace * can finish exiting. This condition is only possible if we are a kernel * socket, as those do not hold references to the namespace. * * Criteria is still not confirmed experimentally and may change. * We kill the socket, if: * 1. If number of orphaned sockets exceeds an administratively configured * limit. * 2. If we have strong memory pressure. * 3. If our net namespace is exiting. */ static int tcp_out_of_resources(struct sock *sk, bool do_reset) { struct tcp_sock *tp = tcp_sk(sk); int shift = 0; /* If peer does not open window for long time, or did not transmit * anything for long time, penalize it. */ if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) shift++; /* If some dubious ICMP arrived, penalize even more. */ if (READ_ONCE(sk->sk_err_soft)) shift++; if (tcp_check_oom(sk, shift)) { /* Catch exceptional cases, when connection requires reset. * 1. Last segment was sent recently. */ if ((s32)(tcp_jiffies32 - tp->lsndtime) <= TCP_TIMEWAIT_LEN || /* 2. Window is closed. */ (!tp->snd_wnd && !tp->packets_out)) do_reset = true; if (do_reset) tcp_send_active_reset(sk, GFP_ATOMIC); tcp_done(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); return 1; } if (!check_net(sock_net(sk))) { /* Not possible to send reset; just close */ tcp_done(sk); return 1; } return 0; } /** * tcp_orphan_retries() - Returns maximal number of retries on an orphaned socket * @sk: Pointer to the current socket. * @alive: bool, socket alive state */ static int tcp_orphan_retries(struct sock *sk, bool alive) { int retries = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_orphan_retries); /* May be zero. */ /* We know from an ICMP that something is wrong. */ if (READ_ONCE(sk->sk_err_soft) && !alive) retries = 0; /* However, if socket sent something recently, select some safe * number of retries. 8 corresponds to >100 seconds with minimal * RTO of 200msec. */ if (retries == 0 && alive) retries = 8; return retries; } static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) { const struct net *net = sock_net(sk); int mss; /* Black hole detection */ if (!READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing)) return; if (!icsk->icsk_mtup.enabled) { icsk->icsk_mtup.enabled = 1; icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } else { mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; mss = min(READ_ONCE(net->ipv4.sysctl_tcp_base_mss), mss); mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_mtu_probe_floor)); mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_min_snd_mss)); icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); } tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } static unsigned int tcp_model_timeout(struct sock *sk, unsigned int boundary, unsigned int rto_base) { unsigned int linear_backoff_thresh, timeout; linear_backoff_thresh = ilog2(TCP_RTO_MAX / rto_base); if (boundary <= linear_backoff_thresh) timeout = ((2 << boundary) - 1) * rto_base; else timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; return jiffies_to_msecs(timeout); } /** * retransmits_timed_out() - returns true if this connection has timed out * @sk: The current socket * @boundary: max number of retransmissions * @timeout: A custom timeout value. * If set to 0 the default timeout is calculated and used. * Using TCP_RTO_MIN and the number of unsuccessful retransmits. * * The default "timeout" value this function can calculate and use * is equivalent to the timeout of a TCP Connection * after "boundary" unsuccessful, exponentially backed-off * retransmissions with an initial RTO of TCP_RTO_MIN. */ static bool retransmits_timed_out(struct sock *sk, unsigned int boundary, unsigned int timeout) { struct tcp_sock *tp = tcp_sk(sk); unsigned int start_ts, delta; if (!inet_csk(sk)->icsk_retransmits) return false; start_ts = tp->retrans_stamp; if (likely(timeout == 0)) { unsigned int rto_base = TCP_RTO_MIN; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) rto_base = tcp_timeout_init(sk); timeout = tcp_model_timeout(sk, boundary, rto_base); } if (tp->tcp_usec_ts) { /* delta maybe off up to a jiffy due to timer granularity. */ delta = tp->tcp_mstamp - start_ts + jiffies_to_usecs(1); return (s32)(delta - timeout * USEC_PER_MSEC) >= 0; } return (s32)(tcp_time_stamp_ts(tp) - start_ts - timeout) >= 0; } /* A write timeout has occurred. Process the after effects. */ static int tcp_write_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); bool expired = false, do_reset; int retry_until, max_retransmits; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) __dst_negative_advice(sk); /* Paired with WRITE_ONCE() in tcp_sock_set_syncnt() */ retry_until = READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); max_retransmits = retry_until; if (sk->sk_state == TCP_SYN_SENT) max_retransmits += READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts); expired = icsk->icsk_retransmits >= max_retransmits; } else { if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1), 0)) { /* Black hole detection */ tcp_mtu_probing(icsk, sk); __dst_negative_advice(sk); } retry_until = READ_ONCE(net->ipv4.sysctl_tcp_retries2); if (sock_flag(sk, SOCK_DEAD)) { const bool alive = icsk->icsk_rto < TCP_RTO_MAX; retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || !retransmits_timed_out(sk, retry_until, 0); if (tcp_out_of_resources(sk, do_reset)) return 1; } } if (!expired) expired = retransmits_timed_out(sk, retry_until, READ_ONCE(icsk->icsk_user_timeout)); tcp_fastopen_active_detect_blackhole(sk, expired); if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG)) tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB, icsk->icsk_retransmits, icsk->icsk_rto, (int)expired); if (expired) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; } if (sk_rethink_txhash(sk)) { tp->timeout_rehash++; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH); } return 0; } /* Called with BH disabled */ void tcp_delack_timer_handler(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return; /* Handling the sack compression case */ if (tp->compressed_ack) { tcp_mstamp_refresh(tp); tcp_sack_compress_send_ack(sk); return; } if (!(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) return; if (time_after(icsk->icsk_ack.timeout, jiffies)) { sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); return; } icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; if (inet_csk_ack_scheduled(sk)) { if (!inet_csk_in_pingpong_mode(sk)) { /* Delayed ACK missed: inflate ATO. */ icsk->icsk_ack.ato = min_t(u32, icsk->icsk_ack.ato << 1, icsk->icsk_rto); } else { /* Delayed ACK missed: leave pingpong mode and * deflate ATO. */ inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } tcp_mstamp_refresh(tp); tcp_send_ack(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); } } /** * tcp_delack_timer() - The TCP delayed ACK timeout handler * @t: Pointer to the timer. (gets casted to struct sock *) * * This function gets (indirectly) called when the kernel timer for a TCP packet * of this socket expires. Calls tcp_delack_timer_handler() to do the actual work. * * Returns: Nothing (void) */ static void tcp_delack_timer(struct timer_list *t) { struct inet_connection_sock *icsk = from_timer(icsk, t, icsk_delack_timer); struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { tcp_delack_timer_handler(sk); } else { __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); /* deleguate our work to tcp_release_cb() */ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); } bh_unlock_sock(sk); sock_put(sk); } static void tcp_probe_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb = tcp_send_head(sk); struct tcp_sock *tp = tcp_sk(sk); int max_probes; if (tp->packets_out || !skb) { icsk->icsk_probes_out = 0; icsk->icsk_probes_tstamp = 0; return; } /* RFC 1122 4.2.2.17 requires the sender to stay open indefinitely as * long as the receiver continues to respond probes. We support this by * default and reset icsk_probes_out with incoming ACKs. But if the * socket is orphaned or the user specifies TCP_USER_TIMEOUT, we * kill the socket when the retry count and the time exceeds the * corresponding system limit. We also implement similar policy when * we use RTO to probe window in tcp_retransmit_timer(). */ if (!icsk->icsk_probes_tstamp) { icsk->icsk_probes_tstamp = tcp_jiffies32; } else { u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout); if (user_timeout && (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >= msecs_to_jiffies(user_timeout)) goto abort; } max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2); if (sock_flag(sk, SOCK_DEAD)) { const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; max_probes = tcp_orphan_retries(sk, alive); if (!alive && icsk->icsk_backoff >= max_probes) goto abort; if (tcp_out_of_resources(sk, true)) return; } if (icsk->icsk_probes_out >= max_probes) { abort: tcp_write_err(sk); } else { /* Only send another probe if we didn't close things up. */ tcp_send_probe0(sk); } } static void tcp_update_rto_stats(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); if (!icsk->icsk_retransmits) { tp->total_rto_recoveries++; tp->rto_stamp = tcp_time_stamp_ms(tp); } icsk->icsk_retransmits++; tp->total_rto++; } /* * Timer for Fast Open socket to retransmit SYNACK. Note that the * sk here is the child socket, not the parent (listener) socket. */ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int max_retries; req->rsk_ops->syn_ack_timeout(req); /* Add one more retry for fastopen. * Paired with WRITE_ONCE() in tcp_sock_set_syncnt() */ max_retries = READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1; if (req->num_timeout >= max_retries) { tcp_write_err(sk); return; } /* Lower cwnd after certain SYNACK timeout like tcp_init_transfer() */ if (icsk->icsk_retransmits == 1) tcp_enter_loss(sk); /* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error * returned from rtx_syn_ack() to make it more persistent like * regular retransmit because if the child socket has been accepted * it's not good to give up too easily. */ inet_rtx_syn_ack(sk, req); req->num_timeout++; tcp_update_rto_stats(sk); if (!tp->retrans_stamp) tp->retrans_stamp = tcp_time_stamp_ts(tp); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, req->timeout << req->num_timeout, TCP_RTO_MAX); } static bool tcp_rtx_probe0_timed_out(const struct sock *sk, const struct sk_buff *skb, u32 rtx_delta) { const struct tcp_sock *tp = tcp_sk(sk); const int timeout = TCP_RTO_MAX * 2; u32 rcv_delta; rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp; if (rcv_delta <= timeout) return false; return msecs_to_jiffies(rtx_delta) > timeout; } /** * tcp_retransmit_timer() - The TCP retransmit timeout handler * @sk: Pointer to the current socket. * * This function gets called when the kernel timer for a TCP packet * of this socket expires. * * It handles retransmission, timer adjustment and other necessary measures. * * Returns: Nothing (void) */ void tcp_retransmit_timer(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock *req; struct sk_buff *skb; req = rcu_dereference_protected(tp->fastopen_rsk, lockdep_sock_is_held(sk)); if (req) { WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); tcp_fastopen_synack_timer(sk, req); /* Before we receive ACK to our SYN-ACK don't retransmit * anything else (e.g., data or FIN segments). */ return; } if (!tp->packets_out) return; skb = tcp_rtx_queue_head(sk); if (WARN_ON_ONCE(!skb)) return; tp->tlp_high_seq = 0; if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { /* Receiver dastardly shrinks window. Our retransmits * become zero probes, but we should not timeout this * connection. If the socket is an orphan, time it out, * we cannot allow such beasts to hang infinitely. */ struct inet_sock *inet = inet_sk(sk); u32 rtx_delta; rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb)); if (tp->tcp_usec_ts) rtx_delta /= USEC_PER_MSEC; if (sk->sk_family == AF_INET) { net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n", &inet->inet_daddr, ntohs(inet->inet_dport), inet->inet_num, tp->snd_una, tp->snd_nxt, jiffies_to_msecs(jiffies - tp->rcv_tstamp), rtx_delta); } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { net_dbg_ratelimited("Probing zero-window on %pI6:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n", &sk->sk_v6_daddr, ntohs(inet->inet_dport), inet->inet_num, tp->snd_una, tp->snd_nxt, jiffies_to_msecs(jiffies - tp->rcv_tstamp), rtx_delta); } #endif if (tcp_rtx_probe0_timed_out(sk, skb, rtx_delta)) { tcp_write_err(sk); goto out; } tcp_enter_loss(sk); tcp_retransmit_skb(sk, skb, 1); __sk_dst_reset(sk); goto out_reset_timer; } __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTS); if (tcp_write_timeout(sk)) goto out; if (icsk->icsk_retransmits == 0) { int mib_idx = 0; if (icsk->icsk_ca_state == TCP_CA_Recovery) { if (tcp_is_sack(tp)) mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; else mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; } else if (icsk->icsk_ca_state == TCP_CA_Loss) { mib_idx = LINUX_MIB_TCPLOSSFAILURES; } else if ((icsk->icsk_ca_state == TCP_CA_Disorder) || tp->sacked_out) { if (tcp_is_sack(tp)) mib_idx = LINUX_MIB_TCPSACKFAILURES; else mib_idx = LINUX_MIB_TCPRENOFAILURES; } if (mib_idx) __NET_INC_STATS(sock_net(sk), mib_idx); } tcp_enter_loss(sk); tcp_update_rto_stats(sk); if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) { /* Retransmission failed because of local congestion, * Let senders fight for local resources conservatively. */ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, TCP_RESOURCE_PROBE_INTERVAL, TCP_RTO_MAX); goto out; } /* Increase the timeout each time we retransmit. Note that * we do not increase the rtt estimate. rto is initialized * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests * that doubling rto each time is the least we can get away with. * In KA9Q, Karn uses this for the first few times, and then * goes to quadratic. netBSD doubles, but only goes up to *64, * and clamps at 1 to 64 sec afterwards. Note that 120 sec is * defined in the protocol as the maximum possible RTT. I guess * we'll have to use something other than TCP to talk to the * University of Mars. * * PAWS allows us longer timeouts and large windows, so once * implemented ftp to mars will work nicely. We will have to fix * the 120 second clamps though! */ icsk->icsk_backoff++; out_reset_timer: /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is * used to reset timer, set to 0. Recalculate 'icsk_rto' as this * might be increased if the stream oscillates between thin and thick, * thus the old value might already be too high compared to the value * set by 'tcp_set_rto' in tcp_input.c which resets the rto without * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating * exponential backoff behaviour to avoid continue hammering * linear-timeout retransmissions into a black hole */ if (sk->sk_state == TCP_ESTABLISHED && (tp->thin_lto || READ_ONCE(net->ipv4.sysctl_tcp_thin_linear_timeouts)) && tcp_stream_is_thin(tp) && icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { icsk->icsk_backoff = 0; icsk->icsk_rto = clamp(__tcp_set_rto(tp), tcp_rto_min(sk), TCP_RTO_MAX); } else if (sk->sk_state != TCP_SYN_SENT || icsk->icsk_backoff > READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts)) { /* Use normal (exponential) backoff unless linear timeouts are * activated. */ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX); if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1) + 1, 0)) __sk_dst_reset(sk); out:; } /* Called with bottom-half processing disabled. Called by tcp_write_timer() */ void tcp_write_timer_handler(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); int event; if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || !icsk->icsk_pending) return; if (time_after(icsk->icsk_timeout, jiffies)) { sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); return; } tcp_mstamp_refresh(tcp_sk(sk)); event = icsk->icsk_pending; switch (event) { case ICSK_TIME_REO_TIMEOUT: tcp_rack_reo_timeout(sk); break; case ICSK_TIME_LOSS_PROBE: tcp_send_loss_probe(sk); break; case ICSK_TIME_RETRANS: icsk->icsk_pending = 0; tcp_retransmit_timer(sk); break; case ICSK_TIME_PROBE0: icsk->icsk_pending = 0; tcp_probe_timer(sk); break; } } static void tcp_write_timer(struct timer_list *t) { struct inet_connection_sock *icsk = from_timer(icsk, t, icsk_retransmit_timer); struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { tcp_write_timer_handler(sk); } else { /* delegate our work to tcp_release_cb() */ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); } bh_unlock_sock(sk); sock_put(sk); } void tcp_syn_ack_timeout(const struct request_sock *req) { struct net *net = read_pnet(&inet_rsk(req)->ireq_net); __NET_INC_STATS(net, LINUX_MIB_TCPTIMEOUTS); } EXPORT_SYMBOL(tcp_syn_ack_timeout); void tcp_set_keepalive(struct sock *sk, int val) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return; if (val && !sock_flag(sk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); else if (!val) inet_csk_delete_keepalive_timer(sk); } EXPORT_SYMBOL_GPL(tcp_set_keepalive); static void tcp_keepalive_timer (struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 elapsed; /* Only process if socket is not in use. */ bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ inet_csk_reset_keepalive_timer (sk, HZ/20); goto out; } if (sk->sk_state == TCP_LISTEN) { pr_err("Hmm... keepalive on a LISTEN ???\n"); goto out; } tcp_mstamp_refresh(tp); if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { if (READ_ONCE(tp->linger2) >= 0) { const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; if (tmo > 0) { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto out; } } tcp_send_active_reset(sk, GFP_ATOMIC); goto death; } if (!sock_flag(sk, SOCK_KEEPOPEN) || ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT))) goto out; elapsed = keepalive_time_when(tp); /* It is alive without keepalive 8) */ if (tp->packets_out || !tcp_write_queue_empty(sk)) goto resched; elapsed = keepalive_time_elapsed(tp); if (elapsed >= keepalive_time_when(tp)) { u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout); /* If the TCP_USER_TIMEOUT option is enabled, use that * to determine when to timeout instead. */ if ((user_timeout != 0 && elapsed >= msecs_to_jiffies(user_timeout) && icsk->icsk_probes_out > 0) || (user_timeout == 0 && icsk->icsk_probes_out >= keepalive_probes(tp))) { tcp_send_active_reset(sk, GFP_ATOMIC); tcp_write_err(sk); goto out; } if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) { icsk->icsk_probes_out++; elapsed = keepalive_intvl_when(tp); } else { /* If keepalive was lost due to local congestion, * try harder. */ elapsed = TCP_RESOURCE_PROBE_INTERVAL; } } else { /* It is tp->rcv_tstamp + keepalive_time_when(tp) */ elapsed = keepalive_time_when(tp) - elapsed; } resched: inet_csk_reset_keepalive_timer (sk, elapsed); goto out; death: tcp_done(sk); out: bh_unlock_sock(sk); sock_put(sk); } static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) { struct tcp_sock *tp = container_of(timer, struct tcp_sock, compressed_ack_timer); struct sock *sk = (struct sock *)tp; bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { if (tp->compressed_ack) { /* Since we have to send one ack finally, * subtract one from tp->compressed_ack to keep * LINUX_MIB_TCPACKCOMPRESSED accurate. */ tp->compressed_ack--; tcp_send_ack(sk); } } else { if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); } bh_unlock_sock(sk); sock_put(sk); return HRTIMER_NORESTART; } void tcp_init_xmit_timers(struct sock *sk) { inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, &tcp_keepalive_timer); hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_SOFT); tcp_sk(sk)->pacing_timer.function = tcp_pace_kick; hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_SOFT); tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick; }
249 18 131 9 24 105 25 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SEQ_FILE_H #define _LINUX_SEQ_FILE_H #include <linux/types.h> #include <linux/string.h> #include <linux/string_helpers.h> #include <linux/bug.h> #include <linux/mutex.h> #include <linux/cpumask.h> #include <linux/nodemask.h> #include <linux/fs.h> #include <linux/cred.h> struct seq_operations; struct seq_file { char *buf; size_t size; size_t from; size_t count; size_t pad_until; loff_t index; loff_t read_pos; struct mutex lock; const struct seq_operations *op; int poll_event; const struct file *file; void *private; }; struct seq_operations { void * (*start) (struct seq_file *m, loff_t *pos); void (*stop) (struct seq_file *m, void *v); void * (*next) (struct seq_file *m, void *v, loff_t *pos); int (*show) (struct seq_file *m, void *v); }; #define SEQ_SKIP 1 /** * seq_has_overflowed - check if the buffer has overflowed * @m: the seq_file handle * * seq_files have a buffer which may overflow. When this happens a larger * buffer is reallocated and all the data will be printed again. * The overflow state is true when m->count == m->size. * * Returns true if the buffer received more than it can hold. */ static inline bool seq_has_overflowed(struct seq_file *m) { return m->count == m->size; } /** * seq_get_buf - get buffer to write arbitrary data to * @m: the seq_file handle * @bufp: the beginning of the buffer is stored here * * Return the number of bytes available in the buffer, or zero if * there's no space. */ static inline size_t seq_get_buf(struct seq_file *m, char **bufp) { BUG_ON(m->count > m->size); if (m->count < m->size) *bufp = m->buf + m->count; else *bufp = NULL; return m->size - m->count; } /** * seq_commit - commit data to the buffer * @m: the seq_file handle * @num: the number of bytes to commit * * Commit @num bytes of data written to a buffer previously acquired * by seq_buf_get. To signal an error condition, or that the data * didn't fit in the available space, pass a negative @num value. */ static inline void seq_commit(struct seq_file *m, int num) { if (num < 0) { m->count = m->size; } else { BUG_ON(m->count + num > m->size); m->count += num; } } /** * seq_setwidth - set padding width * @m: the seq_file handle * @size: the max number of bytes to pad. * * Call seq_setwidth() for setting max width, then call seq_printf() etc. and * finally call seq_pad() to pad the remaining bytes. */ static inline void seq_setwidth(struct seq_file *m, size_t size) { m->pad_until = m->count + size; } void seq_pad(struct seq_file *m, char c); char *mangle_path(char *s, const char *p, const char *esc); int seq_open(struct file *, const struct seq_operations *); ssize_t seq_read(struct file *, char __user *, size_t, loff_t *); ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter); loff_t seq_lseek(struct file *, loff_t, int); int seq_release(struct inode *, struct file *); int seq_write(struct seq_file *seq, const void *data, size_t len); __printf(2, 0) void seq_vprintf(struct seq_file *m, const char *fmt, va_list args); __printf(2, 3) void seq_printf(struct seq_file *m, const char *fmt, ...); void seq_putc(struct seq_file *m, char c); void seq_puts(struct seq_file *m, const char *s); void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, unsigned long long num, unsigned int width); void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, unsigned long long num); void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num); void seq_put_hex_ll(struct seq_file *m, const char *delimiter, unsigned long long v, unsigned int width); void seq_escape_mem(struct seq_file *m, const char *src, size_t len, unsigned int flags, const char *esc); static inline void seq_escape_str(struct seq_file *m, const char *src, unsigned int flags, const char *esc) { seq_escape_mem(m, src, strlen(src), flags, esc); } /** * seq_escape - print string into buffer, escaping some characters * @m: target buffer * @s: NULL-terminated string * @esc: set of characters that need escaping * * Puts string into buffer, replacing each occurrence of character from * @esc with usual octal escape. * * Use seq_has_overflowed() to check for errors. */ static inline void seq_escape(struct seq_file *m, const char *s, const char *esc) { seq_escape_str(m, s, ESCAPE_OCTAL, esc); } void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii); int seq_path(struct seq_file *, const struct path *, const char *); int seq_file_path(struct seq_file *, struct file *, const char *); int seq_dentry(struct seq_file *, struct dentry *, const char *); int seq_path_root(struct seq_file *m, const struct path *path, const struct path *root, const char *esc); void *single_start(struct seq_file *, loff_t *); int single_open(struct file *, int (*)(struct seq_file *, void *), void *); int single_open_size(struct file *, int (*)(struct seq_file *, void *), void *, size_t); int single_release(struct inode *, struct file *); void *__seq_open_private(struct file *, const struct seq_operations *, int); int seq_open_private(struct file *, const struct seq_operations *, int); int seq_release_private(struct inode *, struct file *); #ifdef CONFIG_BINARY_PRINTF void seq_bprintf(struct seq_file *m, const char *f, const u32 *binary); #endif #define DEFINE_SEQ_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ int ret = seq_open(file, &__name ## _sops); \ if (!ret && inode->i_private) { \ struct seq_file *seq_f = file->private_data; \ seq_f->private = inode->i_private; \ } \ return ret; \ } \ \ static const struct file_operations __name ## _fops = { \ .owner = THIS_MODULE, \ .open = __name ## _open, \ .read = seq_read, \ .llseek = seq_lseek, \ .release = seq_release, \ } #define DEFINE_SHOW_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ return single_open(file, __name ## _show, inode->i_private); \ } \ \ static const struct file_operations __name ## _fops = { \ .owner = THIS_MODULE, \ .open = __name ## _open, \ .read = seq_read, \ .llseek = seq_lseek, \ .release = single_release, \ } #define DEFINE_SHOW_STORE_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ return single_open(file, __name ## _show, inode->i_private); \ } \ \ static const struct file_operations __name ## _fops = { \ .owner = THIS_MODULE, \ .open = __name ## _open, \ .read = seq_read, \ .write = __name ## _write, \ .llseek = seq_lseek, \ .release = single_release, \ } #define DEFINE_PROC_SHOW_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ return single_open(file, __name ## _show, pde_data(inode)); \ } \ \ static const struct proc_ops __name ## _proc_ops = { \ .proc_open = __name ## _open, \ .proc_read = seq_read, \ .proc_lseek = seq_lseek, \ .proc_release = single_release, \ } static inline struct user_namespace *seq_user_ns(struct seq_file *seq) { #ifdef CONFIG_USER_NS return seq->file->f_cred->user_ns; #else extern struct user_namespace init_user_ns; return &init_user_ns; #endif } /** * seq_show_options - display mount options with appropriate escapes. * @m: the seq_file handle * @name: the mount option name * @value: the mount option name's value, can be NULL */ static inline void seq_show_option(struct seq_file *m, const char *name, const char *value) { seq_putc(m, ','); seq_escape(m, name, ",= \t\n\\"); if (value) { seq_putc(m, '='); seq_escape(m, value, ", \t\n\\"); } } /** * seq_show_option_n - display mount options with appropriate escapes * where @value must be a specific length (i.e. * not NUL-terminated). * @m: the seq_file handle * @name: the mount option name * @value: the mount option name's value, cannot be NULL * @length: the exact length of @value to display, must be constant expression * * This is a macro since this uses "length" to define the size of the * stack buffer. */ #define seq_show_option_n(m, name, value, length) { \ char val_buf[length + 1]; \ memcpy(val_buf, value, length); \ val_buf[length] = '\0'; \ seq_show_option(m, name, val_buf); \ } #define SEQ_START_TOKEN ((void *)1) /* * Helpers for iteration over list_head-s in seq_files */ extern struct list_head *seq_list_start(struct list_head *head, loff_t pos); extern struct list_head *seq_list_start_head(struct list_head *head, loff_t pos); extern struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos); extern struct list_head *seq_list_start_rcu(struct list_head *head, loff_t pos); extern struct list_head *seq_list_start_head_rcu(struct list_head *head, loff_t pos); extern struct list_head *seq_list_next_rcu(void *v, struct list_head *head, loff_t *ppos); /* * Helpers for iteration over hlist_head-s in seq_files */ extern struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos); extern struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos); extern struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head, loff_t *ppos); extern struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head, loff_t pos); extern struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head, loff_t pos); extern struct hlist_node *seq_hlist_next_rcu(void *v, struct hlist_head *head, loff_t *ppos); /* Helpers for iterating over per-cpu hlist_head-s in seq_files */ extern struct hlist_node *seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos); extern struct hlist_node *seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, int *cpu, loff_t *pos); void seq_file_init(void); #endif
106 106 105 106 106 106 106 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007 IBM Corporation * * Author: Cedric Le Goater <clg@fr.ibm.com> */ #include <linux/nsproxy.h> #include <linux/ipc_namespace.h> #include <linux/sysctl.h> #include <linux/stat.h> #include <linux/capability.h> #include <linux/slab.h> static int msg_max_limit_min = MIN_MSGMAX; static int msg_max_limit_max = HARD_MSGMAX; static int msg_maxsize_limit_min = MIN_MSGSIZEMAX; static int msg_maxsize_limit_max = HARD_MSGSIZEMAX; static struct ctl_table mq_sysctls[] = { { .procname = "queues_max", .data = &init_ipc_ns.mq_queues_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "msg_max", .data = &init_ipc_ns.mq_msg_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &msg_max_limit_min, .extra2 = &msg_max_limit_max, }, { .procname = "msgsize_max", .data = &init_ipc_ns.mq_msgsize_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &msg_maxsize_limit_min, .extra2 = &msg_maxsize_limit_max, }, { .procname = "msg_default", .data = &init_ipc_ns.mq_msg_default, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &msg_max_limit_min, .extra2 = &msg_max_limit_max, }, { .procname = "msgsize_default", .data = &init_ipc_ns.mq_msgsize_default, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &msg_maxsize_limit_min, .extra2 = &msg_maxsize_limit_max, }, {} }; static struct ctl_table_set *set_lookup(struct ctl_table_root *root) { return &current->nsproxy->ipc_ns->mq_set; } static int set_is_seen(struct ctl_table_set *set) { return &current->nsproxy->ipc_ns->mq_set == set; } static struct ctl_table_root set_root = { .lookup = set_lookup, }; bool setup_mq_sysctls(struct ipc_namespace *ns) { struct ctl_table *tbl; setup_sysctl_set(&ns->mq_set, &set_root, set_is_seen); tbl = kmemdup(mq_sysctls, sizeof(mq_sysctls), GFP_KERNEL); if (tbl) { int i; for (i = 0; i < ARRAY_SIZE(mq_sysctls); i++) { if (tbl[i].data == &init_ipc_ns.mq_queues_max) tbl[i].data = &ns->mq_queues_max; else if (tbl[i].data == &init_ipc_ns.mq_msg_max) tbl[i].data = &ns->mq_msg_max; else if (tbl[i].data == &init_ipc_ns.mq_msgsize_max) tbl[i].data = &ns->mq_msgsize_max; else if (tbl[i].data == &init_ipc_ns.mq_msg_default) tbl[i].data = &ns->mq_msg_default; else if (tbl[i].data == &init_ipc_ns.mq_msgsize_default) tbl[i].data = &ns->mq_msgsize_default; else tbl[i].data = NULL; } ns->mq_sysctls = __register_sysctl_table(&ns->mq_set, "fs/mqueue", tbl, ARRAY_SIZE(mq_sysctls)); } if (!ns->mq_sysctls) { kfree(tbl); retire_sysctl_set(&ns->mq_set); return false; } return true; } void retire_mq_sysctls(struct ipc_namespace *ns) { struct ctl_table *tbl; tbl = ns->mq_sysctls->ctl_table_arg; unregister_sysctl_table(ns->mq_sysctls); retire_sysctl_set(&ns->mq_set); kfree(tbl); }
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 // SPDX-License-Identifier: GPL-2.0 /* * Native support for the I/O-Warrior USB devices * * Copyright (c) 2003-2005, 2020 Code Mercenaries GmbH * written by Christian Lucht <lucht@codemercs.com> and * Christoph Jung <jung@codemercs.com> * * based on * usb-skeleton.c by Greg Kroah-Hartman <greg@kroah.com> * brlvger.c by Stephane Dalton <sdalton@videotron.ca> * and Stephane Doyon <s.doyon@videotron.ca> * * Released under the GPLv2. */ #include <linux/module.h> #include <linux/usb.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/mutex.h> #include <linux/poll.h> #include <linux/usb/iowarrior.h> #define DRIVER_AUTHOR "Christian Lucht <lucht@codemercs.com>" #define DRIVER_DESC "USB IO-Warrior driver" #define USB_VENDOR_ID_CODEMERCS 1984 /* low speed iowarrior */ #define USB_DEVICE_ID_CODEMERCS_IOW40 0x1500 #define USB_DEVICE_ID_CODEMERCS_IOW24 0x1501 #define USB_DEVICE_ID_CODEMERCS_IOWPV1 0x1511 #define USB_DEVICE_ID_CODEMERCS_IOWPV2 0x1512 /* full speed iowarrior */ #define USB_DEVICE_ID_CODEMERCS_IOW56 0x1503 /* fuller speed iowarrior */ #define USB_DEVICE_ID_CODEMERCS_IOW28 0x1504 #define USB_DEVICE_ID_CODEMERCS_IOW28L 0x1505 #define USB_DEVICE_ID_CODEMERCS_IOW100 0x1506 /* OEMed devices */ #define USB_DEVICE_ID_CODEMERCS_IOW24SAG 0x158a #define USB_DEVICE_ID_CODEMERCS_IOW56AM 0x158b /* Get a minor range for your devices from the usb maintainer */ #ifdef CONFIG_USB_DYNAMIC_MINORS #define IOWARRIOR_MINOR_BASE 0 #else #define IOWARRIOR_MINOR_BASE 208 // SKELETON_MINOR_BASE 192 + 16, not official yet #endif /* interrupt input queue size */ #define MAX_INTERRUPT_BUFFER 16 /* maximum number of urbs that are submitted for writes at the same time, this applies to the IOWarrior56 only! IOWarrior24 and IOWarrior40 use synchronous usb_control_msg calls. */ #define MAX_WRITES_IN_FLIGHT 4 MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); static struct usb_driver iowarrior_driver; /*--------------*/ /* data */ /*--------------*/ /* Structure to hold all of our device specific stuff */ struct iowarrior { struct mutex mutex; /* locks this structure */ struct usb_device *udev; /* save off the usb device pointer */ struct usb_interface *interface; /* the interface for this device */ unsigned char minor; /* the starting minor number for this device */ struct usb_endpoint_descriptor *int_out_endpoint; /* endpoint for reading (needed for IOW56 only) */ struct usb_endpoint_descriptor *int_in_endpoint; /* endpoint for reading */ struct urb *int_in_urb; /* the urb for reading data */ unsigned char *int_in_buffer; /* buffer for data to be read */ unsigned char serial_number; /* to detect lost packages */ unsigned char *read_queue; /* size is MAX_INTERRUPT_BUFFER * packet size */ wait_queue_head_t read_wait; wait_queue_head_t write_wait; /* wait-queue for writing to the device */ atomic_t write_busy; /* number of write-urbs submitted */ atomic_t read_idx; atomic_t intr_idx; atomic_t overflow_flag; /* signals an index 'rollover' */ int present; /* this is 1 as long as the device is connected */ int opened; /* this is 1 if the device is currently open */ char chip_serial[9]; /* the serial number string of the chip connected */ int report_size; /* number of bytes in a report */ u16 product_id; struct usb_anchor submitted; }; /*--------------*/ /* globals */ /*--------------*/ #define USB_REQ_GET_REPORT 0x01 //#if 0 static int usb_get_report(struct usb_device *dev, struct usb_host_interface *inter, unsigned char type, unsigned char id, void *buf, int size) { return usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), USB_REQ_GET_REPORT, USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE, (type << 8) + id, inter->desc.bInterfaceNumber, buf, size, USB_CTRL_GET_TIMEOUT); } //#endif #define USB_REQ_SET_REPORT 0x09 static int usb_set_report(struct usb_interface *intf, unsigned char type, unsigned char id, void *buf, int size) { return usb_control_msg(interface_to_usbdev(intf), usb_sndctrlpipe(interface_to_usbdev(intf), 0), USB_REQ_SET_REPORT, USB_TYPE_CLASS | USB_RECIP_INTERFACE, (type << 8) + id, intf->cur_altsetting->desc.bInterfaceNumber, buf, size, 1000); } /*---------------------*/ /* driver registration */ /*---------------------*/ /* table of devices that work with this driver */ static const struct usb_device_id iowarrior_ids[] = { {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW40)}, {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW24)}, {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOWPV1)}, {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOWPV2)}, {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW56)}, {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW24SAG)}, {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW56AM)}, {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW28)}, {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW28L)}, {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW100)}, {} /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, iowarrior_ids); /* * USB callback handler for reading data */ static void iowarrior_callback(struct urb *urb) { struct iowarrior *dev = urb->context; int intr_idx; int read_idx; int aux_idx; int offset; int status = urb->status; int retval; switch (status) { case 0: /* success */ break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: return; default: goto exit; } intr_idx = atomic_read(&dev->intr_idx); /* aux_idx become previous intr_idx */ aux_idx = (intr_idx == 0) ? (MAX_INTERRUPT_BUFFER - 1) : (intr_idx - 1); read_idx = atomic_read(&dev->read_idx); /* queue is not empty and it's interface 0 */ if ((intr_idx != read_idx) && (dev->interface->cur_altsetting->desc.bInterfaceNumber == 0)) { /* + 1 for serial number */ offset = aux_idx * (dev->report_size + 1); if (!memcmp (dev->read_queue + offset, urb->transfer_buffer, dev->report_size)) { /* equal values on interface 0 will be ignored */ goto exit; } } /* aux_idx become next intr_idx */ aux_idx = (intr_idx == (MAX_INTERRUPT_BUFFER - 1)) ? 0 : (intr_idx + 1); if (read_idx == aux_idx) { /* queue full, dropping oldest input */ read_idx = (++read_idx == MAX_INTERRUPT_BUFFER) ? 0 : read_idx; atomic_set(&dev->read_idx, read_idx); atomic_set(&dev->overflow_flag, 1); } /* +1 for serial number */ offset = intr_idx * (dev->report_size + 1); memcpy(dev->read_queue + offset, urb->transfer_buffer, dev->report_size); *(dev->read_queue + offset + (dev->report_size)) = dev->serial_number++; atomic_set(&dev->intr_idx, aux_idx); /* tell the blocking read about the new data */ wake_up_interruptible(&dev->read_wait); exit: retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval) dev_err(&dev->interface->dev, "%s - usb_submit_urb failed with result %d\n", __func__, retval); } /* * USB Callback handler for write-ops */ static void iowarrior_write_callback(struct urb *urb) { struct iowarrior *dev; int status = urb->status; dev = urb->context; /* sync/async unlink faults aren't errors */ if (status && !(status == -ENOENT || status == -ECONNRESET || status == -ESHUTDOWN)) { dev_dbg(&dev->interface->dev, "nonzero write bulk status received: %d\n", status); } /* free up our allocated buffer */ usb_free_coherent(urb->dev, urb->transfer_buffer_length, urb->transfer_buffer, urb->transfer_dma); /* tell a waiting writer the interrupt-out-pipe is available again */ atomic_dec(&dev->write_busy); wake_up_interruptible(&dev->write_wait); } /* * iowarrior_delete */ static inline void iowarrior_delete(struct iowarrior *dev) { dev_dbg(&dev->interface->dev, "minor %d\n", dev->minor); kfree(dev->int_in_buffer); usb_free_urb(dev->int_in_urb); kfree(dev->read_queue); usb_put_intf(dev->interface); kfree(dev); } /*---------------------*/ /* fops implementation */ /*---------------------*/ static int read_index(struct iowarrior *dev) { int intr_idx, read_idx; read_idx = atomic_read(&dev->read_idx); intr_idx = atomic_read(&dev->intr_idx); return (read_idx == intr_idx ? -1 : read_idx); } /* * iowarrior_read */ static ssize_t iowarrior_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { struct iowarrior *dev; int read_idx; int offset; dev = file->private_data; /* verify that the device wasn't unplugged */ if (!dev || !dev->present) return -ENODEV; dev_dbg(&dev->interface->dev, "minor %d, count = %zd\n", dev->minor, count); /* read count must be packet size (+ time stamp) */ if ((count != dev->report_size) && (count != (dev->report_size + 1))) return -EINVAL; /* repeat until no buffer overrun in callback handler occur */ do { atomic_set(&dev->overflow_flag, 0); if ((read_idx = read_index(dev)) == -1) { /* queue empty */ if (file->f_flags & O_NONBLOCK) return -EAGAIN; else { //next line will return when there is either new data, or the device is unplugged int r = wait_event_interruptible(dev->read_wait, (!dev->present || (read_idx = read_index (dev)) != -1)); if (r) { //we were interrupted by a signal return -ERESTART; } if (!dev->present) { //The device was unplugged return -ENODEV; } if (read_idx == -1) { // Can this happen ??? return 0; } } } offset = read_idx * (dev->report_size + 1); if (copy_to_user(buffer, dev->read_queue + offset, count)) { return -EFAULT; } } while (atomic_read(&dev->overflow_flag)); read_idx = ++read_idx == MAX_INTERRUPT_BUFFER ? 0 : read_idx; atomic_set(&dev->read_idx, read_idx); return count; } /* * iowarrior_write */ static ssize_t iowarrior_write(struct file *file, const char __user *user_buffer, size_t count, loff_t *ppos) { struct iowarrior *dev; int retval = 0; char *buf = NULL; /* for IOW24 and IOW56 we need a buffer */ struct urb *int_out_urb = NULL; dev = file->private_data; mutex_lock(&dev->mutex); /* verify that the device wasn't unplugged */ if (!dev->present) { retval = -ENODEV; goto exit; } dev_dbg(&dev->interface->dev, "minor %d, count = %zd\n", dev->minor, count); /* if count is 0 we're already done */ if (count == 0) { retval = 0; goto exit; } /* We only accept full reports */ if (count != dev->report_size) { retval = -EINVAL; goto exit; } switch (dev->product_id) { case USB_DEVICE_ID_CODEMERCS_IOW24: case USB_DEVICE_ID_CODEMERCS_IOW24SAG: case USB_DEVICE_ID_CODEMERCS_IOWPV1: case USB_DEVICE_ID_CODEMERCS_IOWPV2: case USB_DEVICE_ID_CODEMERCS_IOW40: /* IOW24 and IOW40 use a synchronous call */ buf = memdup_user(user_buffer, count); if (IS_ERR(buf)) { retval = PTR_ERR(buf); goto exit; } retval = usb_set_report(dev->interface, 2, 0, buf, count); kfree(buf); goto exit; case USB_DEVICE_ID_CODEMERCS_IOW56: case USB_DEVICE_ID_CODEMERCS_IOW56AM: case USB_DEVICE_ID_CODEMERCS_IOW28: case USB_DEVICE_ID_CODEMERCS_IOW28L: case USB_DEVICE_ID_CODEMERCS_IOW100: /* The IOW56 uses asynchronous IO and more urbs */ if (atomic_read(&dev->write_busy) == MAX_WRITES_IN_FLIGHT) { /* Wait until we are below the limit for submitted urbs */ if (file->f_flags & O_NONBLOCK) { retval = -EAGAIN; goto exit; } else { retval = wait_event_interruptible(dev->write_wait, (!dev->present || (atomic_read (&dev-> write_busy) < MAX_WRITES_IN_FLIGHT))); if (retval) { /* we were interrupted by a signal */ retval = -ERESTART; goto exit; } if (!dev->present) { /* The device was unplugged */ retval = -ENODEV; goto exit; } if (!dev->opened) { /* We were closed while waiting for an URB */ retval = -ENODEV; goto exit; } } } atomic_inc(&dev->write_busy); int_out_urb = usb_alloc_urb(0, GFP_KERNEL); if (!int_out_urb) { retval = -ENOMEM; goto error_no_urb; } buf = usb_alloc_coherent(dev->udev, dev->report_size, GFP_KERNEL, &int_out_urb->transfer_dma); if (!buf) { retval = -ENOMEM; dev_dbg(&dev->interface->dev, "Unable to allocate buffer\n"); goto error_no_buffer; } usb_fill_int_urb(int_out_urb, dev->udev, usb_sndintpipe(dev->udev, dev->int_out_endpoint->bEndpointAddress), buf, dev->report_size, iowarrior_write_callback, dev, dev->int_out_endpoint->bInterval); int_out_urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; if (copy_from_user(buf, user_buffer, count)) { retval = -EFAULT; goto error; } usb_anchor_urb(int_out_urb, &dev->submitted); retval = usb_submit_urb(int_out_urb, GFP_KERNEL); if (retval) { dev_dbg(&dev->interface->dev, "submit error %d for urb nr.%d\n", retval, atomic_read(&dev->write_busy)); usb_unanchor_urb(int_out_urb); goto error; } /* submit was ok */ retval = count; usb_free_urb(int_out_urb); goto exit; default: /* what do we have here ? An unsupported Product-ID ? */ dev_err(&dev->interface->dev, "%s - not supported for product=0x%x\n", __func__, dev->product_id); retval = -EFAULT; goto exit; } error: usb_free_coherent(dev->udev, dev->report_size, buf, int_out_urb->transfer_dma); error_no_buffer: usb_free_urb(int_out_urb); error_no_urb: atomic_dec(&dev->write_busy); wake_up_interruptible(&dev->write_wait); exit: mutex_unlock(&dev->mutex); return retval; } /* * iowarrior_ioctl */ static long iowarrior_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct iowarrior *dev = NULL; __u8 *buffer; __u8 __user *user_buffer; int retval; int io_res; /* checks for bytes read/written and copy_to/from_user results */ dev = file->private_data; if (!dev) return -ENODEV; buffer = kzalloc(dev->report_size, GFP_KERNEL); if (!buffer) return -ENOMEM; mutex_lock(&dev->mutex); /* verify that the device wasn't unplugged */ if (!dev->present) { retval = -ENODEV; goto error_out; } dev_dbg(&dev->interface->dev, "minor %d, cmd 0x%.4x, arg %ld\n", dev->minor, cmd, arg); retval = 0; io_res = 0; switch (cmd) { case IOW_WRITE: if (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW24 || dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW24SAG || dev->product_id == USB_DEVICE_ID_CODEMERCS_IOWPV1 || dev->product_id == USB_DEVICE_ID_CODEMERCS_IOWPV2 || dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW40) { user_buffer = (__u8 __user *)arg; io_res = copy_from_user(buffer, user_buffer, dev->report_size); if (io_res) { retval = -EFAULT; } else { io_res = usb_set_report(dev->interface, 2, 0, buffer, dev->report_size); if (io_res < 0) retval = io_res; } } else { retval = -EINVAL; dev_err(&dev->interface->dev, "ioctl 'IOW_WRITE' is not supported for product=0x%x.\n", dev->product_id); } break; case IOW_READ: user_buffer = (__u8 __user *)arg; io_res = usb_get_report(dev->udev, dev->interface->cur_altsetting, 1, 0, buffer, dev->report_size); if (io_res < 0) retval = io_res; else { io_res = copy_to_user(user_buffer, buffer, dev->report_size); if (io_res) retval = -EFAULT; } break; case IOW_GETINFO: { /* Report available information for the device */ struct iowarrior_info info; /* needed for power consumption */ struct usb_config_descriptor *cfg_descriptor = &dev->udev->actconfig->desc; memset(&info, 0, sizeof(info)); /* directly from the descriptor */ info.vendor = le16_to_cpu(dev->udev->descriptor.idVendor); info.product = dev->product_id; info.revision = le16_to_cpu(dev->udev->descriptor.bcdDevice); /* 0==UNKNOWN, 1==LOW(usb1.1) ,2=FULL(usb1.1), 3=HIGH(usb2.0) */ info.speed = dev->udev->speed; info.if_num = dev->interface->cur_altsetting->desc.bInterfaceNumber; info.report_size = dev->report_size; /* serial number string has been read earlier 8 chars or empty string */ memcpy(info.serial, dev->chip_serial, sizeof(dev->chip_serial)); if (cfg_descriptor == NULL) { info.power = -1; /* no information available */ } else { /* the MaxPower is stored in units of 2mA to make it fit into a byte-value */ info.power = cfg_descriptor->bMaxPower * 2; } io_res = copy_to_user((struct iowarrior_info __user *)arg, &info, sizeof(struct iowarrior_info)); if (io_res) retval = -EFAULT; break; } default: /* return that we did not understand this ioctl call */ retval = -ENOTTY; break; } error_out: /* unlock the device */ mutex_unlock(&dev->mutex); kfree(buffer); return retval; } /* * iowarrior_open */ static int iowarrior_open(struct inode *inode, struct file *file) { struct iowarrior *dev = NULL; struct usb_interface *interface; int subminor; int retval = 0; subminor = iminor(inode); interface = usb_find_interface(&iowarrior_driver, subminor); if (!interface) { pr_err("%s - error, can't find device for minor %d\n", __func__, subminor); return -ENODEV; } dev = usb_get_intfdata(interface); if (!dev) return -ENODEV; mutex_lock(&dev->mutex); /* Only one process can open each device, no sharing. */ if (dev->opened) { retval = -EBUSY; goto out; } /* setup interrupt handler for receiving values */ if ((retval = usb_submit_urb(dev->int_in_urb, GFP_KERNEL)) < 0) { dev_err(&interface->dev, "Error %d while submitting URB\n", retval); retval = -EFAULT; goto out; } /* increment our usage count for the driver */ ++dev->opened; /* save our object in the file's private structure */ file->private_data = dev; retval = 0; out: mutex_unlock(&dev->mutex); return retval; } /* * iowarrior_release */ static int iowarrior_release(struct inode *inode, struct file *file) { struct iowarrior *dev; int retval = 0; dev = file->private_data; if (!dev) return -ENODEV; dev_dbg(&dev->interface->dev, "minor %d\n", dev->minor); /* lock our device */ mutex_lock(&dev->mutex); if (dev->opened <= 0) { retval = -ENODEV; /* close called more than once */ mutex_unlock(&dev->mutex); } else { dev->opened = 0; /* we're closing now */ retval = 0; if (dev->present) { /* The device is still connected so we only shutdown pending read-/write-ops. */ usb_kill_urb(dev->int_in_urb); wake_up_interruptible(&dev->read_wait); wake_up_interruptible(&dev->write_wait); mutex_unlock(&dev->mutex); } else { /* The device was unplugged, cleanup resources */ mutex_unlock(&dev->mutex); iowarrior_delete(dev); } } return retval; } static __poll_t iowarrior_poll(struct file *file, poll_table * wait) { struct iowarrior *dev = file->private_data; __poll_t mask = 0; if (!dev->present) return EPOLLERR | EPOLLHUP; poll_wait(file, &dev->read_wait, wait); poll_wait(file, &dev->write_wait, wait); if (!dev->present) return EPOLLERR | EPOLLHUP; if (read_index(dev) != -1) mask |= EPOLLIN | EPOLLRDNORM; if (atomic_read(&dev->write_busy) < MAX_WRITES_IN_FLIGHT) mask |= EPOLLOUT | EPOLLWRNORM; return mask; } /* * File operations needed when we register this driver. * This assumes that this driver NEEDS file operations, * of course, which means that the driver is expected * to have a node in the /dev directory. If the USB * device were for a network interface then the driver * would use "struct net_driver" instead, and a serial * device would use "struct tty_driver". */ static const struct file_operations iowarrior_fops = { .owner = THIS_MODULE, .write = iowarrior_write, .read = iowarrior_read, .unlocked_ioctl = iowarrior_ioctl, .open = iowarrior_open, .release = iowarrior_release, .poll = iowarrior_poll, .llseek = noop_llseek, }; static char *iowarrior_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev)); } /* * usb class driver info in order to get a minor number from the usb core, * and to have the device registered with devfs and the driver core */ static struct usb_class_driver iowarrior_class = { .name = "iowarrior%d", .devnode = iowarrior_devnode, .fops = &iowarrior_fops, .minor_base = IOWARRIOR_MINOR_BASE, }; /*---------------------------------*/ /* probe and disconnect functions */ /*---------------------------------*/ /* * iowarrior_probe * * Called by the usb core when a new device is connected that it thinks * this driver might be interested in. */ static int iowarrior_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(interface); struct iowarrior *dev = NULL; struct usb_host_interface *iface_desc; int retval = -ENOMEM; int res; /* allocate memory for our device state and initialize it */ dev = kzalloc(sizeof(struct iowarrior), GFP_KERNEL); if (!dev) return retval; mutex_init(&dev->mutex); atomic_set(&dev->intr_idx, 0); atomic_set(&dev->read_idx, 0); atomic_set(&dev->overflow_flag, 0); init_waitqueue_head(&dev->read_wait); atomic_set(&dev->write_busy, 0); init_waitqueue_head(&dev->write_wait); dev->udev = udev; dev->interface = usb_get_intf(interface); iface_desc = interface->cur_altsetting; dev->product_id = le16_to_cpu(udev->descriptor.idProduct); init_usb_anchor(&dev->submitted); res = usb_find_last_int_in_endpoint(iface_desc, &dev->int_in_endpoint); if (res) { dev_err(&interface->dev, "no interrupt-in endpoint found\n"); retval = res; goto error; } if ((dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56) || (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56AM) || (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW28) || (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW28L) || (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW100)) { res = usb_find_last_int_out_endpoint(iface_desc, &dev->int_out_endpoint); if (res) { dev_err(&interface->dev, "no interrupt-out endpoint found\n"); retval = res; goto error; } } /* we have to check the report_size often, so remember it in the endianness suitable for our machine */ dev->report_size = usb_endpoint_maxp(dev->int_in_endpoint); /* * Some devices need the report size to be different than the * endpoint size. */ if (dev->interface->cur_altsetting->desc.bInterfaceNumber == 0) { switch (dev->product_id) { case USB_DEVICE_ID_CODEMERCS_IOW56: case USB_DEVICE_ID_CODEMERCS_IOW56AM: dev->report_size = 7; break; case USB_DEVICE_ID_CODEMERCS_IOW28: case USB_DEVICE_ID_CODEMERCS_IOW28L: dev->report_size = 4; break; case USB_DEVICE_ID_CODEMERCS_IOW100: dev->report_size = 12; break; } } /* create the urb and buffer for reading */ dev->int_in_urb = usb_alloc_urb(0, GFP_KERNEL); if (!dev->int_in_urb) goto error; dev->int_in_buffer = kmalloc(dev->report_size, GFP_KERNEL); if (!dev->int_in_buffer) goto error; usb_fill_int_urb(dev->int_in_urb, dev->udev, usb_rcvintpipe(dev->udev, dev->int_in_endpoint->bEndpointAddress), dev->int_in_buffer, dev->report_size, iowarrior_callback, dev, dev->int_in_endpoint->bInterval); /* create an internal buffer for interrupt data from the device */ dev->read_queue = kmalloc_array(dev->report_size + 1, MAX_INTERRUPT_BUFFER, GFP_KERNEL); if (!dev->read_queue) goto error; /* Get the serial-number of the chip */ memset(dev->chip_serial, 0x00, sizeof(dev->chip_serial)); usb_string(udev, udev->descriptor.iSerialNumber, dev->chip_serial, sizeof(dev->chip_serial)); if (strlen(dev->chip_serial) != 8) memset(dev->chip_serial, 0x00, sizeof(dev->chip_serial)); /* Set the idle timeout to 0, if this is interface 0 */ if (dev->interface->cur_altsetting->desc.bInterfaceNumber == 0) { usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x0A, USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); } /* allow device read and ioctl */ dev->present = 1; /* we can register the device now, as it is ready */ usb_set_intfdata(interface, dev); retval = usb_register_dev(interface, &iowarrior_class); if (retval) { /* something prevented us from registering this driver */ dev_err(&interface->dev, "Not able to get a minor for this device.\n"); goto error; } dev->minor = interface->minor; /* let the user know what node this device is now attached to */ dev_info(&interface->dev, "IOWarrior product=0x%x, serial=%s interface=%d " "now attached to iowarrior%d\n", dev->product_id, dev->chip_serial, iface_desc->desc.bInterfaceNumber, dev->minor - IOWARRIOR_MINOR_BASE); return retval; error: iowarrior_delete(dev); return retval; } /* * iowarrior_disconnect * * Called by the usb core when the device is removed from the system. */ static void iowarrior_disconnect(struct usb_interface *interface) { struct iowarrior *dev = usb_get_intfdata(interface); int minor = dev->minor; usb_deregister_dev(interface, &iowarrior_class); mutex_lock(&dev->mutex); /* prevent device read, write and ioctl */ dev->present = 0; if (dev->opened) { /* There is a process that holds a filedescriptor to the device , so we only shutdown read-/write-ops going on. Deleting the device is postponed until close() was called. */ usb_kill_urb(dev->int_in_urb); usb_kill_anchored_urbs(&dev->submitted); wake_up_interruptible(&dev->read_wait); wake_up_interruptible(&dev->write_wait); mutex_unlock(&dev->mutex); } else { /* no process is using the device, cleanup now */ mutex_unlock(&dev->mutex); iowarrior_delete(dev); } dev_info(&interface->dev, "I/O-Warror #%d now disconnected\n", minor - IOWARRIOR_MINOR_BASE); } /* usb specific object needed to register this driver with the usb subsystem */ static struct usb_driver iowarrior_driver = { .name = "iowarrior", .probe = iowarrior_probe, .disconnect = iowarrior_disconnect, .id_table = iowarrior_ids, }; module_usb_driver(iowarrior_driver);
7 23 29 9 24 8 19 16 22 3 10 3 3 9 7 20 2 1 1 10 2 7 3 3 7 6 1 4 1 3 42 8 1 8 11 1 1 9 6 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 // SPDX-License-Identifier: GPL-2.0-only /* * fs/eventfd.c * * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> * */ #include <linux/file.h> #include <linux/poll.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/sched/signal.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/anon_inodes.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/kref.h> #include <linux/eventfd.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/idr.h> #include <linux/uio.h> static DEFINE_IDA(eventfd_ida); struct eventfd_ctx { struct kref kref; wait_queue_head_t wqh; /* * Every time that a write(2) is performed on an eventfd, the * value of the __u64 being written is added to "count" and a * wakeup is performed on "wqh". If EFD_SEMAPHORE flag was not * specified, a read(2) will return the "count" value to userspace, * and will reset "count" to zero. The kernel side eventfd_signal() * also, adds to the "count" counter and issue a wakeup. */ __u64 count; unsigned int flags; int id; }; __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask) { unsigned long flags; /* * Deadlock or stack overflow issues can happen if we recurse here * through waitqueue wakeup handlers. If the caller users potentially * nested waitqueues with custom wakeup handlers, then it should * check eventfd_signal_allowed() before calling this function. If * it returns false, the eventfd_signal() call should be deferred to a * safe context. */ if (WARN_ON_ONCE(current->in_eventfd)) return 0; spin_lock_irqsave(&ctx->wqh.lock, flags); current->in_eventfd = 1; if (ULLONG_MAX - ctx->count < n) n = ULLONG_MAX - ctx->count; ctx->count += n; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask); current->in_eventfd = 0; spin_unlock_irqrestore(&ctx->wqh.lock, flags); return n; } /** * eventfd_signal - Adds @n to the eventfd counter. * @ctx: [in] Pointer to the eventfd context. * @n: [in] Value of the counter to be added to the eventfd internal counter. * The value cannot be negative. * * This function is supposed to be called by the kernel in paths that do not * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX * value, and we signal this as overflow condition by returning a EPOLLERR * to poll(2). * * Returns the amount by which the counter was incremented. This will be less * than @n if the counter has overflowed. */ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) { return eventfd_signal_mask(ctx, n, 0); } EXPORT_SYMBOL_GPL(eventfd_signal); static void eventfd_free_ctx(struct eventfd_ctx *ctx) { if (ctx->id >= 0) ida_simple_remove(&eventfd_ida, ctx->id); kfree(ctx); } static void eventfd_free(struct kref *kref) { struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref); eventfd_free_ctx(ctx); } /** * eventfd_ctx_put - Releases a reference to the internal eventfd context. * @ctx: [in] Pointer to eventfd context. * * The eventfd context reference must have been previously acquired either * with eventfd_ctx_fdget() or eventfd_ctx_fileget(). */ void eventfd_ctx_put(struct eventfd_ctx *ctx) { kref_put(&ctx->kref, eventfd_free); } EXPORT_SYMBOL_GPL(eventfd_ctx_put); static int eventfd_release(struct inode *inode, struct file *file) { struct eventfd_ctx *ctx = file->private_data; wake_up_poll(&ctx->wqh, EPOLLHUP); eventfd_ctx_put(ctx); return 0; } static __poll_t eventfd_poll(struct file *file, poll_table *wait) { struct eventfd_ctx *ctx = file->private_data; __poll_t events = 0; u64 count; poll_wait(file, &ctx->wqh, wait); /* * All writes to ctx->count occur within ctx->wqh.lock. This read * can be done outside ctx->wqh.lock because we know that poll_wait * takes that lock (through add_wait_queue) if our caller will sleep. * * The read _can_ therefore seep into add_wait_queue's critical * section, but cannot move above it! add_wait_queue's spin_lock acts * as an acquire barrier and ensures that the read be ordered properly * against the writes. The following CAN happen and is safe: * * poll write * ----------------- ------------ * lock ctx->wqh.lock (in poll_wait) * count = ctx->count * __add_wait_queue * unlock ctx->wqh.lock * lock ctx->qwh.lock * ctx->count += n * if (waitqueue_active) * wake_up_locked_poll * unlock ctx->qwh.lock * eventfd_poll returns 0 * * but the following, which would miss a wakeup, cannot happen: * * poll write * ----------------- ------------ * count = ctx->count (INVALID!) * lock ctx->qwh.lock * ctx->count += n * **waitqueue_active is false** * **no wake_up_locked_poll!** * unlock ctx->qwh.lock * lock ctx->wqh.lock (in poll_wait) * __add_wait_queue * unlock ctx->wqh.lock * eventfd_poll returns 0 */ count = READ_ONCE(ctx->count); if (count > 0) events |= EPOLLIN; if (count == ULLONG_MAX) events |= EPOLLERR; if (ULLONG_MAX - 1 > count) events |= EPOLLOUT; return events; } void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) { lockdep_assert_held(&ctx->wqh.lock); *cnt = ((ctx->flags & EFD_SEMAPHORE) && ctx->count) ? 1 : ctx->count; ctx->count -= *cnt; } EXPORT_SYMBOL_GPL(eventfd_ctx_do_read); /** * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. * @ctx: [in] Pointer to eventfd context. * @wait: [in] Wait queue to be removed. * @cnt: [out] Pointer to the 64-bit counter value. * * Returns %0 if successful, or the following error codes: * * -EAGAIN : The operation would have blocked. * * This is used to atomically remove a wait queue entry from the eventfd wait * queue head, and read/reset the counter value. */ int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt) { unsigned long flags; spin_lock_irqsave(&ctx->wqh.lock, flags); eventfd_ctx_do_read(ctx, cnt); __remove_wait_queue(&ctx->wqh, wait); if (*cnt != 0 && waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLOUT); spin_unlock_irqrestore(&ctx->wqh.lock, flags); return *cnt != 0 ? 0 : -EAGAIN; } EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue); static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct eventfd_ctx *ctx = file->private_data; __u64 ucnt = 0; if (iov_iter_count(to) < sizeof(ucnt)) return -EINVAL; spin_lock_irq(&ctx->wqh.lock); if (!ctx->count) { if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) { spin_unlock_irq(&ctx->wqh.lock); return -EAGAIN; } if (wait_event_interruptible_locked_irq(ctx->wqh, ctx->count)) { spin_unlock_irq(&ctx->wqh.lock); return -ERESTARTSYS; } } eventfd_ctx_do_read(ctx, &ucnt); current->in_eventfd = 1; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLOUT); current->in_eventfd = 0; spin_unlock_irq(&ctx->wqh.lock); if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt))) return -EFAULT; return sizeof(ucnt); } static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct eventfd_ctx *ctx = file->private_data; ssize_t res; __u64 ucnt; if (count < sizeof(ucnt)) return -EINVAL; if (copy_from_user(&ucnt, buf, sizeof(ucnt))) return -EFAULT; if (ucnt == ULLONG_MAX) return -EINVAL; spin_lock_irq(&ctx->wqh.lock); res = -EAGAIN; if (ULLONG_MAX - ctx->count > ucnt) res = sizeof(ucnt); else if (!(file->f_flags & O_NONBLOCK)) { res = wait_event_interruptible_locked_irq(ctx->wqh, ULLONG_MAX - ctx->count > ucnt); if (!res) res = sizeof(ucnt); } if (likely(res > 0)) { ctx->count += ucnt; current->in_eventfd = 1; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN); current->in_eventfd = 0; } spin_unlock_irq(&ctx->wqh.lock); return res; } #ifdef CONFIG_PROC_FS static void eventfd_show_fdinfo(struct seq_file *m, struct file *f) { struct eventfd_ctx *ctx = f->private_data; spin_lock_irq(&ctx->wqh.lock); seq_printf(m, "eventfd-count: %16llx\n", (unsigned long long)ctx->count); spin_unlock_irq(&ctx->wqh.lock); seq_printf(m, "eventfd-id: %d\n", ctx->id); seq_printf(m, "eventfd-semaphore: %d\n", !!(ctx->flags & EFD_SEMAPHORE)); } #endif static const struct file_operations eventfd_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = eventfd_show_fdinfo, #endif .release = eventfd_release, .poll = eventfd_poll, .read_iter = eventfd_read, .write = eventfd_write, .llseek = noop_llseek, }; /** * eventfd_fget - Acquire a reference of an eventfd file descriptor. * @fd: [in] Eventfd file descriptor. * * Returns a pointer to the eventfd file structure in case of success, or the * following error pointer: * * -EBADF : Invalid @fd file descriptor. * -EINVAL : The @fd file descriptor is not an eventfd file. */ struct file *eventfd_fget(int fd) { struct file *file; file = fget(fd); if (!file) return ERR_PTR(-EBADF); if (file->f_op != &eventfd_fops) { fput(file); return ERR_PTR(-EINVAL); } return file; } EXPORT_SYMBOL_GPL(eventfd_fget); /** * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context. * @fd: [in] Eventfd file descriptor. * * Returns a pointer to the internal eventfd context, otherwise the error * pointers returned by the following functions: * * eventfd_fget */ struct eventfd_ctx *eventfd_ctx_fdget(int fd) { struct eventfd_ctx *ctx; struct fd f = fdget(fd); if (!f.file) return ERR_PTR(-EBADF); ctx = eventfd_ctx_fileget(f.file); fdput(f); return ctx; } EXPORT_SYMBOL_GPL(eventfd_ctx_fdget); /** * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context. * @file: [in] Eventfd file pointer. * * Returns a pointer to the internal eventfd context, otherwise the error * pointer: * * -EINVAL : The @fd file descriptor is not an eventfd file. */ struct eventfd_ctx *eventfd_ctx_fileget(struct file *file) { struct eventfd_ctx *ctx; if (file->f_op != &eventfd_fops) return ERR_PTR(-EINVAL); ctx = file->private_data; kref_get(&ctx->kref); return ctx; } EXPORT_SYMBOL_GPL(eventfd_ctx_fileget); static int do_eventfd(unsigned int count, int flags) { struct eventfd_ctx *ctx; struct file *file; int fd; /* Check the EFD_* constants for consistency. */ BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); if (flags & ~EFD_FLAGS_SET) return -EINVAL; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; kref_init(&ctx->kref); init_waitqueue_head(&ctx->wqh); ctx->count = count; ctx->flags = flags; ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL); flags &= EFD_SHARED_FCNTL_FLAGS; flags |= O_RDWR; fd = get_unused_fd_flags(flags); if (fd < 0) goto err; file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags); if (IS_ERR(file)) { put_unused_fd(fd); fd = PTR_ERR(file); goto err; } file->f_mode |= FMODE_NOWAIT; fd_install(fd, file); return fd; err: eventfd_free_ctx(ctx); return fd; } SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) { return do_eventfd(count, flags); } SYSCALL_DEFINE1(eventfd, unsigned int, count) { return do_eventfd(count, 0); }
125 5 2 3 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "../kernel/futex/futex.h" #include "io_uring.h" #include "rsrc.h" #include "futex.h" struct io_futex { struct file *file; union { u32 __user *uaddr; struct futex_waitv __user *uwaitv; }; unsigned long futex_val; unsigned long futex_mask; unsigned long futexv_owned; u32 futex_flags; unsigned int futex_nr; bool futexv_unqueued; }; struct io_futex_data { union { struct futex_q q; struct io_cache_entry cache; }; struct io_kiocb *req; }; void io_futex_cache_init(struct io_ring_ctx *ctx) { io_alloc_cache_init(&ctx->futex_cache, IO_NODE_ALLOC_CACHE_MAX, sizeof(struct io_futex_data)); } static void io_futex_cache_entry_free(struct io_cache_entry *entry) { kfree(container_of(entry, struct io_futex_data, cache)); } void io_futex_cache_free(struct io_ring_ctx *ctx) { io_alloc_cache_free(&ctx->futex_cache, io_futex_cache_entry_free); } static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) { req->async_data = NULL; hlist_del_init(&req->hash_node); io_req_task_complete(req, ts); } static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) { struct io_futex_data *ifd = req->async_data; struct io_ring_ctx *ctx = req->ctx; io_tw_lock(ctx, ts); if (!io_alloc_cache_put(&ctx->futex_cache, &ifd->cache)) kfree(ifd); __io_futex_complete(req, ts); } static void io_futexv_complete(struct io_kiocb *req, struct io_tw_state *ts) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); struct futex_vector *futexv = req->async_data; io_tw_lock(req->ctx, ts); if (!iof->futexv_unqueued) { int res; res = futex_unqueue_multiple(futexv, iof->futex_nr); if (res != -1) io_req_set_res(req, res, 0); } kfree(req->async_data); req->flags &= ~REQ_F_ASYNC_DATA; __io_futex_complete(req, ts); } static bool io_futexv_claim(struct io_futex *iof) { if (test_bit(0, &iof->futexv_owned) || test_and_set_bit_lock(0, &iof->futexv_owned)) return false; return true; } static bool __io_futex_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) { /* futex wake already done or in progress */ if (req->opcode == IORING_OP_FUTEX_WAIT) { struct io_futex_data *ifd = req->async_data; if (!futex_unqueue(&ifd->q)) return false; req->io_task_work.func = io_futex_complete; } else { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); if (!io_futexv_claim(iof)) return false; req->io_task_work.func = io_futexv_complete; } hlist_del_init(&req->hash_node); io_req_set_res(req, -ECANCELED, 0); io_req_task_work_add(req); return true; } int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags) { struct hlist_node *tmp; struct io_kiocb *req; int nr = 0; if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED)) return -ENOENT; io_ring_submit_lock(ctx, issue_flags); hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) { if (req->cqe.user_data != cd->data && !(cd->flags & IORING_ASYNC_CANCEL_ANY)) continue; if (__io_futex_cancel(ctx, req)) nr++; if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) break; } io_ring_submit_unlock(ctx, issue_flags); if (nr) return nr; return -ENOENT; } bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all) { struct hlist_node *tmp; struct io_kiocb *req; bool found = false; lockdep_assert_held(&ctx->uring_lock); hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) { if (!io_match_task_safe(req, task, cancel_all)) continue; __io_futex_cancel(ctx, req); found = true; } return found; } int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); u32 flags; if (unlikely(sqe->len || sqe->futex_flags || sqe->buf_index || sqe->file_index)) return -EINVAL; iof->uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr)); iof->futex_val = READ_ONCE(sqe->addr2); iof->futex_mask = READ_ONCE(sqe->addr3); flags = READ_ONCE(sqe->fd); if (flags & ~FUTEX2_VALID_MASK) return -EINVAL; iof->futex_flags = futex2_to_flags(flags); if (!futex_flags_valid(iof->futex_flags)) return -EINVAL; if (!futex_validate_input(iof->futex_flags, iof->futex_val) || !futex_validate_input(iof->futex_flags, iof->futex_mask)) return -EINVAL; return 0; } static void io_futex_wakev_fn(struct wake_q_head *wake_q, struct futex_q *q) { struct io_kiocb *req = q->wake_data; struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); if (!io_futexv_claim(iof)) return; if (unlikely(!__futex_wake_mark(q))) return; io_req_set_res(req, 0, 0); req->io_task_work.func = io_futexv_complete; io_req_task_work_add(req); } int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); struct futex_vector *futexv; int ret; /* No flags or mask supported for waitv */ if (unlikely(sqe->fd || sqe->buf_index || sqe->file_index || sqe->addr2 || sqe->futex_flags || sqe->addr3)) return -EINVAL; iof->uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr)); iof->futex_nr = READ_ONCE(sqe->len); if (!iof->futex_nr || iof->futex_nr > FUTEX_WAITV_MAX) return -EINVAL; futexv = kcalloc(iof->futex_nr, sizeof(*futexv), GFP_KERNEL); if (!futexv) return -ENOMEM; ret = futex_parse_waitv(futexv, iof->uwaitv, iof->futex_nr, io_futex_wakev_fn, req); if (ret) { kfree(futexv); return ret; } iof->futexv_owned = 0; iof->futexv_unqueued = 0; req->flags |= REQ_F_ASYNC_DATA; req->async_data = futexv; return 0; } static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q) { struct io_futex_data *ifd = container_of(q, struct io_futex_data, q); struct io_kiocb *req = ifd->req; if (unlikely(!__futex_wake_mark(q))) return; io_req_set_res(req, 0, 0); req->io_task_work.func = io_futex_complete; io_req_task_work_add(req); } static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx) { struct io_cache_entry *entry; entry = io_alloc_cache_get(&ctx->futex_cache); if (entry) return container_of(entry, struct io_futex_data, cache); return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT); } int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); struct futex_vector *futexv = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret, woken = -1; io_ring_submit_lock(ctx, issue_flags); ret = futex_wait_multiple_setup(futexv, iof->futex_nr, &woken); /* * Error case, ret is < 0. Mark the request as failed. */ if (unlikely(ret < 0)) { io_ring_submit_unlock(ctx, issue_flags); req_set_fail(req); io_req_set_res(req, ret, 0); kfree(futexv); req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; return IOU_OK; } /* * 0 return means that we successfully setup the waiters, and that * nobody triggered a wakeup while we were doing so. If the wakeup * happened post setup, the task_work will be run post this issue and * under the submission lock. 1 means We got woken while setting up, * let that side do the completion. Note that * futex_wait_multiple_setup() will have unqueued all the futexes in * this case. Mark us as having done that already, since this is * different from normal wakeup. */ if (!ret) { /* * If futex_wait_multiple_setup() returns 0 for a * successful setup, then the task state will not be * runnable. This is fine for the sync syscall, as * it'll be blocking unless we already got one of the * futexes woken, but it obviously won't work for an * async invocation. Mark us runnable again. */ __set_current_state(TASK_RUNNING); hlist_add_head(&req->hash_node, &ctx->futex_list); } else { iof->futexv_unqueued = 1; if (woken != -1) io_req_set_res(req, woken, 0); } io_ring_submit_unlock(ctx, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); struct io_ring_ctx *ctx = req->ctx; struct io_futex_data *ifd = NULL; struct futex_hash_bucket *hb; int ret; if (!iof->futex_mask) { ret = -EINVAL; goto done; } io_ring_submit_lock(ctx, issue_flags); ifd = io_alloc_ifd(ctx); if (!ifd) { ret = -ENOMEM; goto done_unlock; } req->async_data = ifd; ifd->q = futex_q_init; ifd->q.bitset = iof->futex_mask; ifd->q.wake = io_futex_wake_fn; ifd->req = req; ret = futex_wait_setup(iof->uaddr, iof->futex_val, iof->futex_flags, &ifd->q, &hb); if (!ret) { hlist_add_head(&req->hash_node, &ctx->futex_list); io_ring_submit_unlock(ctx, issue_flags); futex_queue(&ifd->q, hb); return IOU_ISSUE_SKIP_COMPLETE; } done_unlock: io_ring_submit_unlock(ctx, issue_flags); done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); kfree(ifd); return IOU_OK; } int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); int ret; /* * Strict flags - ensure that waking 0 futexes yields a 0 result. * See commit 43adf8449510 ("futex: FLAGS_STRICT") for details. */ ret = futex_wake(iof->uaddr, FLAGS_STRICT | iof->futex_flags, iof->futex_val, iof->futex_mask); if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; }
18 3 4 4 4 4 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/fsnotify_backend.h> #include <linux/path.h> #include <linux/slab.h> #include <linux/exportfs.h> #include <linux/hashtable.h> extern struct kmem_cache *fanotify_mark_cache; extern struct kmem_cache *fanotify_fid_event_cachep; extern struct kmem_cache *fanotify_path_event_cachep; extern struct kmem_cache *fanotify_perm_event_cachep; /* Possible states of the permission event */ enum { FAN_EVENT_INIT, FAN_EVENT_REPORTED, FAN_EVENT_ANSWERED, FAN_EVENT_CANCELED, }; /* * 3 dwords are sufficient for most local fs (64bit ino, 32bit generation). * fh buf should be dword aligned. On 64bit arch, the ext_buf pointer is * stored in either the first or last 2 dwords. */ #define FANOTIFY_INLINE_FH_LEN (3 << 2) #define FANOTIFY_FH_HDR_LEN offsetof(struct fanotify_fh, buf) /* Fixed size struct for file handle */ struct fanotify_fh { u8 type; u8 len; #define FANOTIFY_FH_FLAG_EXT_BUF 1 u8 flags; u8 pad; unsigned char buf[]; } __aligned(4); /* Variable size struct for dir file handle + child file handle + name */ struct fanotify_info { /* size of dir_fh/file_fh including fanotify_fh hdr size */ u8 dir_fh_totlen; u8 dir2_fh_totlen; u8 file_fh_totlen; u8 name_len; u8 name2_len; u8 pad[3]; unsigned char buf[]; /* * (struct fanotify_fh) dir_fh starts at buf[0] * (optional) dir2_fh starts at buf[dir_fh_totlen] * (optional) file_fh starts at buf[dir_fh_totlen + dir2_fh_totlen] * name starts at buf[dir_fh_totlen + dir2_fh_totlen + file_fh_totlen] * ... */ #define FANOTIFY_DIR_FH_SIZE(info) ((info)->dir_fh_totlen) #define FANOTIFY_DIR2_FH_SIZE(info) ((info)->dir2_fh_totlen) #define FANOTIFY_FILE_FH_SIZE(info) ((info)->file_fh_totlen) #define FANOTIFY_NAME_SIZE(info) ((info)->name_len + 1) #define FANOTIFY_NAME2_SIZE(info) ((info)->name2_len + 1) #define FANOTIFY_DIR_FH_OFFSET(info) 0 #define FANOTIFY_DIR2_FH_OFFSET(info) \ (FANOTIFY_DIR_FH_OFFSET(info) + FANOTIFY_DIR_FH_SIZE(info)) #define FANOTIFY_FILE_FH_OFFSET(info) \ (FANOTIFY_DIR2_FH_OFFSET(info) + FANOTIFY_DIR2_FH_SIZE(info)) #define FANOTIFY_NAME_OFFSET(info) \ (FANOTIFY_FILE_FH_OFFSET(info) + FANOTIFY_FILE_FH_SIZE(info)) #define FANOTIFY_NAME2_OFFSET(info) \ (FANOTIFY_NAME_OFFSET(info) + FANOTIFY_NAME_SIZE(info)) #define FANOTIFY_DIR_FH_BUF(info) \ ((info)->buf + FANOTIFY_DIR_FH_OFFSET(info)) #define FANOTIFY_DIR2_FH_BUF(info) \ ((info)->buf + FANOTIFY_DIR2_FH_OFFSET(info)) #define FANOTIFY_FILE_FH_BUF(info) \ ((info)->buf + FANOTIFY_FILE_FH_OFFSET(info)) #define FANOTIFY_NAME_BUF(info) \ ((info)->buf + FANOTIFY_NAME_OFFSET(info)) #define FANOTIFY_NAME2_BUF(info) \ ((info)->buf + FANOTIFY_NAME2_OFFSET(info)) } __aligned(4); static inline bool fanotify_fh_has_ext_buf(struct fanotify_fh *fh) { return (fh->flags & FANOTIFY_FH_FLAG_EXT_BUF); } static inline char **fanotify_fh_ext_buf_ptr(struct fanotify_fh *fh) { BUILD_BUG_ON(FANOTIFY_FH_HDR_LEN % 4); BUILD_BUG_ON(__alignof__(char *) - 4 + sizeof(char *) > FANOTIFY_INLINE_FH_LEN); return (char **)ALIGN((unsigned long)(fh->buf), __alignof__(char *)); } static inline void *fanotify_fh_ext_buf(struct fanotify_fh *fh) { return *fanotify_fh_ext_buf_ptr(fh); } static inline void *fanotify_fh_buf(struct fanotify_fh *fh) { return fanotify_fh_has_ext_buf(fh) ? fanotify_fh_ext_buf(fh) : fh->buf; } static inline int fanotify_info_dir_fh_len(struct fanotify_info *info) { if (!info->dir_fh_totlen || WARN_ON_ONCE(info->dir_fh_totlen < FANOTIFY_FH_HDR_LEN)) return 0; return info->dir_fh_totlen - FANOTIFY_FH_HDR_LEN; } static inline struct fanotify_fh *fanotify_info_dir_fh(struct fanotify_info *info) { BUILD_BUG_ON(offsetof(struct fanotify_info, buf) % 4); return (struct fanotify_fh *)FANOTIFY_DIR_FH_BUF(info); } static inline int fanotify_info_dir2_fh_len(struct fanotify_info *info) { if (!info->dir2_fh_totlen || WARN_ON_ONCE(info->dir2_fh_totlen < FANOTIFY_FH_HDR_LEN)) return 0; return info->dir2_fh_totlen - FANOTIFY_FH_HDR_LEN; } static inline struct fanotify_fh *fanotify_info_dir2_fh(struct fanotify_info *info) { return (struct fanotify_fh *)FANOTIFY_DIR2_FH_BUF(info); } static inline int fanotify_info_file_fh_len(struct fanotify_info *info) { if (!info->file_fh_totlen || WARN_ON_ONCE(info->file_fh_totlen < FANOTIFY_FH_HDR_LEN)) return 0; return info->file_fh_totlen - FANOTIFY_FH_HDR_LEN; } static inline struct fanotify_fh *fanotify_info_file_fh(struct fanotify_info *info) { return (struct fanotify_fh *)FANOTIFY_FILE_FH_BUF(info); } static inline char *fanotify_info_name(struct fanotify_info *info) { if (!info->name_len) return NULL; return FANOTIFY_NAME_BUF(info); } static inline char *fanotify_info_name2(struct fanotify_info *info) { if (!info->name2_len) return NULL; return FANOTIFY_NAME2_BUF(info); } static inline void fanotify_info_init(struct fanotify_info *info) { BUILD_BUG_ON(FANOTIFY_FH_HDR_LEN + MAX_HANDLE_SZ > U8_MAX); BUILD_BUG_ON(NAME_MAX > U8_MAX); info->dir_fh_totlen = 0; info->dir2_fh_totlen = 0; info->file_fh_totlen = 0; info->name_len = 0; info->name2_len = 0; } /* These set/copy helpers MUST be called by order */ static inline void fanotify_info_set_dir_fh(struct fanotify_info *info, unsigned int totlen) { if (WARN_ON_ONCE(info->dir2_fh_totlen > 0) || WARN_ON_ONCE(info->file_fh_totlen > 0) || WARN_ON_ONCE(info->name_len > 0) || WARN_ON_ONCE(info->name2_len > 0)) return; info->dir_fh_totlen = totlen; } static inline void fanotify_info_set_dir2_fh(struct fanotify_info *info, unsigned int totlen) { if (WARN_ON_ONCE(info->file_fh_totlen > 0) || WARN_ON_ONCE(info->name_len > 0) || WARN_ON_ONCE(info->name2_len > 0)) return; info->dir2_fh_totlen = totlen; } static inline void fanotify_info_set_file_fh(struct fanotify_info *info, unsigned int totlen) { if (WARN_ON_ONCE(info->name_len > 0) || WARN_ON_ONCE(info->name2_len > 0)) return; info->file_fh_totlen = totlen; } static inline void fanotify_info_copy_name(struct fanotify_info *info, const struct qstr *name) { if (WARN_ON_ONCE(name->len > NAME_MAX) || WARN_ON_ONCE(info->name2_len > 0)) return; info->name_len = name->len; strcpy(fanotify_info_name(info), name->name); } static inline void fanotify_info_copy_name2(struct fanotify_info *info, const struct qstr *name) { if (WARN_ON_ONCE(name->len > NAME_MAX)) return; info->name2_len = name->len; strcpy(fanotify_info_name2(info), name->name); } /* * Common structure for fanotify events. Concrete structs are allocated in * fanotify_handle_event() and freed when the information is retrieved by * userspace. The type of event determines how it was allocated, how it will * be freed and which concrete struct it may be cast to. */ enum fanotify_event_type { FANOTIFY_EVENT_TYPE_FID, /* fixed length */ FANOTIFY_EVENT_TYPE_FID_NAME, /* variable length */ FANOTIFY_EVENT_TYPE_PATH, FANOTIFY_EVENT_TYPE_PATH_PERM, FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */ FANOTIFY_EVENT_TYPE_FS_ERROR, /* struct fanotify_error_event */ __FANOTIFY_EVENT_TYPE_NUM }; #define FANOTIFY_EVENT_TYPE_BITS \ (ilog2(__FANOTIFY_EVENT_TYPE_NUM - 1) + 1) #define FANOTIFY_EVENT_HASH_BITS \ (32 - FANOTIFY_EVENT_TYPE_BITS) struct fanotify_event { struct fsnotify_event fse; struct hlist_node merge_list; /* List for hashed merge */ u32 mask; struct { unsigned int type : FANOTIFY_EVENT_TYPE_BITS; unsigned int hash : FANOTIFY_EVENT_HASH_BITS; }; struct pid *pid; }; static inline void fanotify_init_event(struct fanotify_event *event, unsigned int hash, u32 mask) { fsnotify_init_event(&event->fse); INIT_HLIST_NODE(&event->merge_list); event->hash = hash; event->mask = mask; event->pid = NULL; } #define FANOTIFY_INLINE_FH(name, size) \ struct { \ struct fanotify_fh name; \ /* Space for object_fh.buf[] - access with fanotify_fh_buf() */ \ unsigned char _inline_fh_buf[size]; \ } struct fanotify_fid_event { struct fanotify_event fae; __kernel_fsid_t fsid; FANOTIFY_INLINE_FH(object_fh, FANOTIFY_INLINE_FH_LEN); }; static inline struct fanotify_fid_event * FANOTIFY_FE(struct fanotify_event *event) { return container_of(event, struct fanotify_fid_event, fae); } struct fanotify_name_event { struct fanotify_event fae; __kernel_fsid_t fsid; struct fanotify_info info; }; static inline struct fanotify_name_event * FANOTIFY_NE(struct fanotify_event *event) { return container_of(event, struct fanotify_name_event, fae); } struct fanotify_error_event { struct fanotify_event fae; s32 error; /* Error reported by the Filesystem. */ u32 err_count; /* Suppressed errors count */ __kernel_fsid_t fsid; /* FSID this error refers to. */ FANOTIFY_INLINE_FH(object_fh, MAX_HANDLE_SZ); }; static inline struct fanotify_error_event * FANOTIFY_EE(struct fanotify_event *event) { return container_of(event, struct fanotify_error_event, fae); } static inline __kernel_fsid_t *fanotify_event_fsid(struct fanotify_event *event) { if (event->type == FANOTIFY_EVENT_TYPE_FID) return &FANOTIFY_FE(event)->fsid; else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME) return &FANOTIFY_NE(event)->fsid; else if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR) return &FANOTIFY_EE(event)->fsid; else return NULL; } static inline struct fanotify_fh *fanotify_event_object_fh( struct fanotify_event *event) { if (event->type == FANOTIFY_EVENT_TYPE_FID) return &FANOTIFY_FE(event)->object_fh; else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME) return fanotify_info_file_fh(&FANOTIFY_NE(event)->info); else if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR) return &FANOTIFY_EE(event)->object_fh; else return NULL; } static inline struct fanotify_info *fanotify_event_info( struct fanotify_event *event) { if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME) return &FANOTIFY_NE(event)->info; else return NULL; } static inline int fanotify_event_object_fh_len(struct fanotify_event *event) { struct fanotify_info *info = fanotify_event_info(event); struct fanotify_fh *fh = fanotify_event_object_fh(event); if (info) return info->file_fh_totlen ? fh->len : 0; else return fh ? fh->len : 0; } static inline int fanotify_event_dir_fh_len(struct fanotify_event *event) { struct fanotify_info *info = fanotify_event_info(event); return info ? fanotify_info_dir_fh_len(info) : 0; } static inline int fanotify_event_dir2_fh_len(struct fanotify_event *event) { struct fanotify_info *info = fanotify_event_info(event); return info ? fanotify_info_dir2_fh_len(info) : 0; } static inline bool fanotify_event_has_object_fh(struct fanotify_event *event) { /* For error events, even zeroed fh are reported. */ if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR) return true; return fanotify_event_object_fh_len(event) > 0; } static inline bool fanotify_event_has_dir_fh(struct fanotify_event *event) { return fanotify_event_dir_fh_len(event) > 0; } static inline bool fanotify_event_has_dir2_fh(struct fanotify_event *event) { return fanotify_event_dir2_fh_len(event) > 0; } static inline bool fanotify_event_has_any_dir_fh(struct fanotify_event *event) { return fanotify_event_has_dir_fh(event) || fanotify_event_has_dir2_fh(event); } struct fanotify_path_event { struct fanotify_event fae; struct path path; }; static inline struct fanotify_path_event * FANOTIFY_PE(struct fanotify_event *event) { return container_of(event, struct fanotify_path_event, fae); } /* * Structure for permission fanotify events. It gets allocated and freed in * fanotify_handle_event() since we wait there for user response. When the * information is retrieved by userspace the structure is moved from * group->notification_list to group->fanotify_data.access_list to wait for * user response. */ struct fanotify_perm_event { struct fanotify_event fae; struct path path; u32 response; /* userspace answer to the event */ unsigned short state; /* state of the event */ int fd; /* fd we passed to userspace for this event */ union { struct fanotify_response_info_header hdr; struct fanotify_response_info_audit_rule audit_rule; }; }; static inline struct fanotify_perm_event * FANOTIFY_PERM(struct fanotify_event *event) { return container_of(event, struct fanotify_perm_event, fae); } static inline bool fanotify_is_perm_event(u32 mask) { return IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS) && mask & FANOTIFY_PERM_EVENTS; } static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse) { return container_of(fse, struct fanotify_event, fse); } static inline bool fanotify_is_error_event(u32 mask) { return mask & FAN_FS_ERROR; } static inline const struct path *fanotify_event_path(struct fanotify_event *event) { if (event->type == FANOTIFY_EVENT_TYPE_PATH) return &FANOTIFY_PE(event)->path; else if (event->type == FANOTIFY_EVENT_TYPE_PATH_PERM) return &FANOTIFY_PERM(event)->path; else return NULL; } /* * Use 128 size hash table to speed up events merge. */ #define FANOTIFY_HTABLE_BITS (7) #define FANOTIFY_HTABLE_SIZE (1 << FANOTIFY_HTABLE_BITS) #define FANOTIFY_HTABLE_MASK (FANOTIFY_HTABLE_SIZE - 1) /* * Permission events and overflow event do not get merged - don't hash them. */ static inline bool fanotify_is_hashed_event(u32 mask) { return !(fanotify_is_perm_event(mask) || fsnotify_is_overflow_event(mask)); } static inline unsigned int fanotify_event_hash_bucket( struct fsnotify_group *group, struct fanotify_event *event) { return event->hash & FANOTIFY_HTABLE_MASK; } static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark) { unsigned int mflags = 0; if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) mflags |= FAN_MARK_IGNORED_SURV_MODIFY; if (mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF) mflags |= FAN_MARK_EVICTABLE; if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) mflags |= FAN_MARK_IGNORE; return mflags; }
1524 1 1 8211 8163 8179 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 // SPDX-License-Identifier: GPL-2.0 #include <linux/spinlock.h> #include <linux/task_work.h> #include <linux/resume_user_mode.h> static struct callback_head work_exited; /* all we need is ->next == NULL */ /** * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback * @work: the callback to run * @notify: how to notify the targeted task * * Queue @work for task_work_run() below and notify the @task if @notify * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI. * * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted * task and run the task_work, regardless of whether the task is currently * running in the kernel or userspace. * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a * reschedule IPI to force the targeted task to reschedule and run task_work. * This can be advantageous if there's no strict requirement that the * task_work be run as soon as possible, just whenever the task enters the * kernel anyway. * @TWA_RESUME work is run only when the task exits the kernel and returns to * user mode, or before entering guest mode. * * Fails if the @task is exiting/exited and thus it can't process this @work. * Otherwise @work->func() will be called when the @task goes through one of * the aforementioned transitions, or exits. * * If the targeted task is exiting, then an error is returned and the work item * is not queued. It's up to the caller to arrange for an alternative mechanism * in that case. * * Note: there is no ordering guarantee on works queued here. The task_work * list is LIFO. * * RETURNS: * 0 if succeeds or -ESRCH. */ int task_work_add(struct task_struct *task, struct callback_head *work, enum task_work_notify_mode notify) { struct callback_head *head; /* record the work call stack in order to print it in KASAN reports */ kasan_record_aux_stack(work); head = READ_ONCE(task->task_works); do { if (unlikely(head == &work_exited)) return -ESRCH; work->next = head; } while (!try_cmpxchg(&task->task_works, &head, work)); switch (notify) { case TWA_NONE: break; case TWA_RESUME: set_notify_resume(task); break; case TWA_SIGNAL: set_notify_signal(task); break; case TWA_SIGNAL_NO_IPI: __set_notify_signal(task); break; default: WARN_ON_ONCE(1); break; } return 0; } /** * task_work_cancel_match - cancel a pending work added by task_work_add() * @task: the task which should execute the work * @match: match function to call * @data: data to be passed in to match function * * RETURNS: * The found work or NULL if not found. */ struct callback_head * task_work_cancel_match(struct task_struct *task, bool (*match)(struct callback_head *, void *data), void *data) { struct callback_head **pprev = &task->task_works; struct callback_head *work; unsigned long flags; if (likely(!task_work_pending(task))) return NULL; /* * If cmpxchg() fails we continue without updating pprev. * Either we raced with task_work_add() which added the * new entry before this work, we will find it again. Or * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); work = READ_ONCE(*pprev); while (work) { if (!match(work, data)) { pprev = &work->next; work = READ_ONCE(*pprev); } else if (try_cmpxchg(pprev, &work, work->next)) break; } raw_spin_unlock_irqrestore(&task->pi_lock, flags); return work; } static bool task_work_func_match(struct callback_head *cb, void *data) { return cb->func == data; } /** * task_work_cancel - cancel a pending work added by task_work_add() * @task: the task which should execute the work * @func: identifies the work to remove * * Find the last queued pending work with ->func == @func and remove * it from queue. * * RETURNS: * The found work or NULL if not found. */ struct callback_head * task_work_cancel(struct task_struct *task, task_work_func_t func) { return task_work_cancel_match(task, task_work_func_match, func); } /** * task_work_run - execute the works added by task_work_add() * * Flush the pending works. Should be used by the core kernel code. * Called before the task returns to the user-mode or stops, or when * it exits. In the latter case task_work_add() can no longer add the * new work after task_work_run() returns. */ void task_work_run(void) { struct task_struct *task = current; struct callback_head *work, *head, *next; for (;;) { /* * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ work = READ_ONCE(task->task_works); do { head = NULL; if (!work) { if (task->flags & PF_EXITING) head = &work_exited; else break; } } while (!try_cmpxchg(&task->task_works, &work, head)); if (!work) break; /* * Synchronize with task_work_cancel(). It can not remove * the first entry == work, cmpxchg(task_works) must fail. * But it can remove another entry from the ->next list. */ raw_spin_lock_irq(&task->pi_lock); raw_spin_unlock_irq(&task->pi_lock); do { next = work->next; work->func(work); work = next; cond_resched(); } while (work); } }
109 109 109 109 108 27 27 55 55 2 29 29 49 49 55 2 7 90 94 7 90 109 107 107 7 25 25 83 42 88 87 83 2 2 27 27 27 27 109 22 29 109 109 109 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfs/bnode.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Handle basic btree node operations */ #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/swap.h> #include "btree.h" void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len) { struct page *page; int pagenum; int bytes_read; int bytes_to_read; off += node->page_offset; pagenum = off >> PAGE_SHIFT; off &= ~PAGE_MASK; /* compute page offset for the first page */ for (bytes_read = 0; bytes_read < len; bytes_read += bytes_to_read) { if (pagenum >= node->tree->pages_per_bnode) break; page = node->page[pagenum]; bytes_to_read = min_t(int, len - bytes_read, PAGE_SIZE - off); memcpy_from_page(buf + bytes_read, page, off, bytes_to_read); pagenum++; off = 0; /* page offset only applies to the first page */ } } u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off) { __be16 data; // optimize later... hfs_bnode_read(node, &data, off, 2); return be16_to_cpu(data); } u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off) { u8 data; // optimize later... hfs_bnode_read(node, &data, off, 1); return data; } void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off) { struct hfs_btree *tree; int key_len; tree = node->tree; if (node->type == HFS_NODE_LEAF || tree->attributes & HFS_TREE_VARIDXKEYS) key_len = hfs_bnode_read_u8(node, off) + 1; else key_len = tree->max_key_len + 1; hfs_bnode_read(node, key, off, key_len); } void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len) { struct page *page; off += node->page_offset; page = node->page[0]; memcpy_to_page(page, off, buf, len); set_page_dirty(page); } void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data) { __be16 v = cpu_to_be16(data); // optimize later... hfs_bnode_write(node, &v, off, 2); } void hfs_bnode_write_u8(struct hfs_bnode *node, int off, u8 data) { // optimize later... hfs_bnode_write(node, &data, off, 1); } void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) { struct page *page; off += node->page_offset; page = node->page[0]; memzero_page(page, off, len); set_page_dirty(page); } void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct hfs_bnode *src_node, int src, int len) { struct page *src_page, *dst_page; hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; src += src_node->page_offset; dst += dst_node->page_offset; src_page = src_node->page[0]; dst_page = dst_node->page[0]; memcpy_page(dst_page, dst, src_page, src, len); set_page_dirty(dst_page); } void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) { struct page *page; void *ptr; hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); if (!len) return; src += node->page_offset; dst += node->page_offset; page = node->page[0]; ptr = kmap_local_page(page); memmove(ptr + dst, ptr + src, len); kunmap_local(ptr); set_page_dirty(page); } void hfs_bnode_dump(struct hfs_bnode *node) { struct hfs_bnode_desc desc; __be32 cnid; int i, off, key_off; hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this); hfs_bnode_read(node, &desc, 0, sizeof(desc)); hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n", be32_to_cpu(desc.next), be32_to_cpu(desc.prev), desc.type, desc.height, be16_to_cpu(desc.num_recs)); off = node->tree->node_size - 2; for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { key_off = hfs_bnode_read_u16(node, off); hfs_dbg_cont(BNODE_MOD, " %d", key_off); if (i && node->type == HFS_NODE_INDEX) { int tmp; if (node->tree->attributes & HFS_TREE_VARIDXKEYS) tmp = (hfs_bnode_read_u8(node, key_off) | 1) + 1; else tmp = node->tree->max_key_len + 1; hfs_dbg_cont(BNODE_MOD, " (%d,%d", tmp, hfs_bnode_read_u8(node, key_off)); hfs_bnode_read(node, &cnid, key_off + tmp, 4); hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid)); } else if (i && node->type == HFS_NODE_LEAF) { int tmp; tmp = hfs_bnode_read_u8(node, key_off); hfs_dbg_cont(BNODE_MOD, " (%d)", tmp); } } hfs_dbg_cont(BNODE_MOD, "\n"); } void hfs_bnode_unlink(struct hfs_bnode *node) { struct hfs_btree *tree; struct hfs_bnode *tmp; __be32 cnid; tree = node->tree; if (node->prev) { tmp = hfs_bnode_find(tree, node->prev); if (IS_ERR(tmp)) return; tmp->next = node->next; cnid = cpu_to_be32(tmp->next); hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, next), 4); hfs_bnode_put(tmp); } else if (node->type == HFS_NODE_LEAF) tree->leaf_head = node->next; if (node->next) { tmp = hfs_bnode_find(tree, node->next); if (IS_ERR(tmp)) return; tmp->prev = node->prev; cnid = cpu_to_be32(tmp->prev); hfs_bnode_write(tmp, &cnid, offsetof(struct hfs_bnode_desc, prev), 4); hfs_bnode_put(tmp); } else if (node->type == HFS_NODE_LEAF) tree->leaf_tail = node->prev; // move down? if (!node->prev && !node->next) { printk(KERN_DEBUG "hfs_btree_del_level\n"); } if (!node->parent) { tree->root = 0; tree->depth = 0; } set_bit(HFS_BNODE_DELETED, &node->flags); } static inline int hfs_bnode_hash(u32 num) { num = (num >> 16) + num; num += num >> 8; return num & (NODE_HASH_SIZE - 1); } struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) { struct hfs_bnode *node; if (cnid >= tree->node_count) { pr_err("request for non-existent node %d in B*Tree\n", cnid); return NULL; } for (node = tree->node_hash[hfs_bnode_hash(cnid)]; node; node = node->next_hash) { if (node->this == cnid) { return node; } } return NULL; } static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) { struct hfs_bnode *node, *node2; struct address_space *mapping; struct page *page; int size, block, i, hash; loff_t off; if (cnid >= tree->node_count) { pr_err("request for non-existent node %d in B*Tree\n", cnid); return NULL; } size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * sizeof(struct page *); node = kzalloc(size, GFP_KERNEL); if (!node) return NULL; node->tree = tree; node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); atomic_set(&node->refcnt, 1); hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n", node->tree->cnid, node->this); init_waitqueue_head(&node->lock_wq); spin_lock(&tree->hash_lock); node2 = hfs_bnode_findhash(tree, cnid); if (!node2) { hash = hfs_bnode_hash(cnid); node->next_hash = tree->node_hash[hash]; tree->node_hash[hash] = node; tree->node_hash_cnt++; } else { hfs_bnode_get(node2); spin_unlock(&tree->hash_lock); kfree(node); wait_event(node2->lock_wq, !test_bit(HFS_BNODE_NEW, &node2->flags)); return node2; } spin_unlock(&tree->hash_lock); mapping = tree->inode->i_mapping; off = (loff_t)cnid * tree->node_size; block = off >> PAGE_SHIFT; node->page_offset = off & ~PAGE_MASK; for (i = 0; i < tree->pages_per_bnode; i++) { page = read_mapping_page(mapping, block++, NULL); if (IS_ERR(page)) goto fail; node->page[i] = page; } return node; fail: set_bit(HFS_BNODE_ERROR, &node->flags); return node; } void hfs_bnode_unhash(struct hfs_bnode *node) { struct hfs_bnode **p; hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) ; BUG_ON(!*p); *p = node->next_hash; node->tree->node_hash_cnt--; } /* Load a particular node out of a tree */ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) { struct hfs_bnode *node; struct hfs_bnode_desc *desc; int i, rec_off, off, next_off; int entry_size, key_size; spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, num); if (node) { hfs_bnode_get(node); spin_unlock(&tree->hash_lock); wait_event(node->lock_wq, !test_bit(HFS_BNODE_NEW, &node->flags)); if (test_bit(HFS_BNODE_ERROR, &node->flags)) goto node_error; return node; } spin_unlock(&tree->hash_lock); node = __hfs_bnode_create(tree, num); if (!node) return ERR_PTR(-ENOMEM); if (test_bit(HFS_BNODE_ERROR, &node->flags)) goto node_error; if (!test_bit(HFS_BNODE_NEW, &node->flags)) return node; desc = (struct hfs_bnode_desc *)(kmap_local_page(node->page[0]) + node->page_offset); node->prev = be32_to_cpu(desc->prev); node->next = be32_to_cpu(desc->next); node->num_recs = be16_to_cpu(desc->num_recs); node->type = desc->type; node->height = desc->height; kunmap_local(desc); switch (node->type) { case HFS_NODE_HEADER: case HFS_NODE_MAP: if (node->height != 0) goto node_error; break; case HFS_NODE_LEAF: if (node->height != 1) goto node_error; break; case HFS_NODE_INDEX: if (node->height <= 1 || node->height > tree->depth) goto node_error; break; default: goto node_error; } rec_off = tree->node_size - 2; off = hfs_bnode_read_u16(node, rec_off); if (off != sizeof(struct hfs_bnode_desc)) goto node_error; for (i = 1; i <= node->num_recs; off = next_off, i++) { rec_off -= 2; next_off = hfs_bnode_read_u16(node, rec_off); if (next_off <= off || next_off > tree->node_size || next_off & 1) goto node_error; entry_size = next_off - off; if (node->type != HFS_NODE_INDEX && node->type != HFS_NODE_LEAF) continue; key_size = hfs_bnode_read_u8(node, off) + 1; if (key_size >= entry_size /*|| key_size & 1*/) goto node_error; } clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); return node; node_error: set_bit(HFS_BNODE_ERROR, &node->flags); clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); hfs_bnode_put(node); return ERR_PTR(-EIO); } void hfs_bnode_free(struct hfs_bnode *node) { int i; for (i = 0; i < node->tree->pages_per_bnode; i++) if (node->page[i]) put_page(node->page[i]); kfree(node); } struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) { struct hfs_bnode *node; struct page **pagep; int i; spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, num); spin_unlock(&tree->hash_lock); if (node) { pr_crit("new node %u already hashed?\n", num); WARN_ON(1); return node; } node = __hfs_bnode_create(tree, num); if (!node) return ERR_PTR(-ENOMEM); if (test_bit(HFS_BNODE_ERROR, &node->flags)) { hfs_bnode_put(node); return ERR_PTR(-EIO); } pagep = node->page; memzero_page(*pagep, node->page_offset, min((int)PAGE_SIZE, (int)tree->node_size)); set_page_dirty(*pagep); for (i = 1; i < tree->pages_per_bnode; i++) { memzero_page(*++pagep, 0, PAGE_SIZE); set_page_dirty(*pagep); } clear_bit(HFS_BNODE_NEW, &node->flags); wake_up(&node->lock_wq); return node; } void hfs_bnode_get(struct hfs_bnode *node) { if (node) { atomic_inc(&node->refcnt); hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); } } /* Dispose of resources used by a node */ void hfs_bnode_put(struct hfs_bnode *node) { if (node) { struct hfs_btree *tree = node->tree; int i; hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); BUG_ON(!atomic_read(&node->refcnt)); if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) return; for (i = 0; i < tree->pages_per_bnode; i++) { if (!node->page[i]) continue; mark_page_accessed(node->page[i]); } if (test_bit(HFS_BNODE_DELETED, &node->flags)) { hfs_bnode_unhash(node); spin_unlock(&tree->hash_lock); hfs_bmap_free(node); hfs_bnode_free(node); return; } spin_unlock(&tree->hash_lock); } }
32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the UDP protocol. * * Version: @(#)udp.h 1.0.2 04/28/93 * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_UDP_H #define _LINUX_UDP_H #include <net/inet_sock.h> #include <linux/skbuff.h> #include <net/netns/hash.h> #include <uapi/linux/udp.h> static inline struct udphdr *udp_hdr(const struct sk_buff *skb) { return (struct udphdr *)skb_transport_header(skb); } #define UDP_HTABLE_SIZE_MIN_PERNET 128 #define UDP_HTABLE_SIZE_MIN (CONFIG_BASE_SMALL ? 128 : 256) #define UDP_HTABLE_SIZE_MAX 65536 static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask) { return (num + net_hash_mix(net)) & mask; } enum { UDP_FLAGS_CORK, /* Cork is required */ UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */ UDP_FLAGS_NO_CHECK6_RX, /* Allow zero UDP6 checksums on RX? */ UDP_FLAGS_GRO_ENABLED, /* Request GRO aggregation */ UDP_FLAGS_ACCEPT_FRAGLIST, UDP_FLAGS_ACCEPT_L4, UDP_FLAGS_ENCAP_ENABLED, /* This socket enabled encap */ UDP_FLAGS_UDPLITE_SEND_CC, /* set via udplite setsockopt */ UDP_FLAGS_UDPLITE_RECV_CC, /* set via udplite setsockopt */ }; struct udp_sock { /* inet_sock has to be the first member */ struct inet_sock inet; #define udp_port_hash inet.sk.__sk_common.skc_u16hashes[0] #define udp_portaddr_hash inet.sk.__sk_common.skc_u16hashes[1] #define udp_portaddr_node inet.sk.__sk_common.skc_portaddr_node unsigned long udp_flags; int pending; /* Any pending frames ? */ __u8 encap_type; /* Is this an Encapsulation socket? */ /* * Following member retains the information to create a UDP header * when the socket is uncorked. */ __u16 len; /* total length of pending frames */ __u16 gso_size; /* * Fields specific to UDP-Lite. */ __u16 pcslen; __u16 pcrlen; /* * For encapsulation sockets. */ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload); int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb); void (*encap_destroy)(struct sock *sk); /* GRO functions for UDP socket */ struct sk_buff * (*gro_receive)(struct sock *sk, struct list_head *head, struct sk_buff *skb); int (*gro_complete)(struct sock *sk, struct sk_buff *skb, int nhoff); /* udp_recvmsg try to use this before splicing sk_receive_queue */ struct sk_buff_head reader_queue ____cacheline_aligned_in_smp; /* This field is dirtied by udp_recvmsg() */ int forward_deficit; /* This fields follows rcvbuf value, and is touched by udp_recvmsg */ int forward_threshold; }; #define udp_test_bit(nr, sk) \ test_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) #define udp_set_bit(nr, sk) \ set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) #define udp_test_and_set_bit(nr, sk) \ test_and_set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) #define udp_clear_bit(nr, sk) \ clear_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) #define udp_assign_bit(nr, sk, val) \ assign_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags, val) #define UDP_MAX_SEGMENTS (1 << 6UL) #define udp_sk(ptr) container_of_const(ptr, struct udp_sock, inet.sk) static inline void udp_set_no_check6_tx(struct sock *sk, bool val) { udp_assign_bit(NO_CHECK6_TX, sk, val); } static inline void udp_set_no_check6_rx(struct sock *sk, bool val) { udp_assign_bit(NO_CHECK6_RX, sk, val); } static inline bool udp_get_no_check6_tx(const struct sock *sk) { return udp_test_bit(NO_CHECK6_TX, sk); } static inline bool udp_get_no_check6_rx(const struct sock *sk) { return udp_test_bit(NO_CHECK6_RX, sk); } static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { int gso_size; if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) { gso_size = skb_shinfo(skb)->gso_size; put_cmsg(msg, SOL_UDP, UDP_GRO, sizeof(gso_size), &gso_size); } } static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) { if (!skb_is_gso(skb)) return false; if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && !udp_test_bit(ACCEPT_L4, sk)) return true; if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST && !udp_test_bit(ACCEPT_FRAGLIST, sk)) return true; return false; } static inline void udp_allow_gso(struct sock *sk) { udp_set_bit(ACCEPT_L4, sk); udp_set_bit(ACCEPT_FRAGLIST, sk); } #define udp_portaddr_for_each_entry(__sk, list) \ hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node) #define udp_portaddr_for_each_entry_rcu(__sk, list) \ hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node) #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE) #endif /* _LINUX_UDP_H */
4 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/string.h> #include <linux/skbuff.h> #include <linux/export.h> #include <net/caif/cfpkt.h> #define PKT_PREFIX 48 #define PKT_POSTFIX 2 #define PKT_LEN_WHEN_EXTENDING 128 #define PKT_ERROR(pkt, errmsg) \ do { \ cfpkt_priv(pkt)->erronous = true; \ skb_reset_tail_pointer(&pkt->skb); \ pr_warn(errmsg); \ } while (0) struct cfpktq { struct sk_buff_head head; atomic_t count; /* Lock protects count updates */ spinlock_t lock; }; /* * net/caif/ is generic and does not * understand SKB, so we do this typecast */ struct cfpkt { struct sk_buff skb; }; /* Private data inside SKB */ struct cfpkt_priv_data { struct dev_info dev_info; bool erronous; }; static inline struct cfpkt_priv_data *cfpkt_priv(struct cfpkt *pkt) { return (struct cfpkt_priv_data *) pkt->skb.cb; } static inline bool is_erronous(struct cfpkt *pkt) { return cfpkt_priv(pkt)->erronous; } static inline struct sk_buff *pkt_to_skb(struct cfpkt *pkt) { return &pkt->skb; } static inline struct cfpkt *skb_to_pkt(struct sk_buff *skb) { return (struct cfpkt *) skb; } struct cfpkt *cfpkt_fromnative(enum caif_direction dir, void *nativepkt) { struct cfpkt *pkt = skb_to_pkt(nativepkt); cfpkt_priv(pkt)->erronous = false; return pkt; } EXPORT_SYMBOL(cfpkt_fromnative); void *cfpkt_tonative(struct cfpkt *pkt) { return (void *) pkt; } EXPORT_SYMBOL(cfpkt_tonative); static struct cfpkt *cfpkt_create_pfx(u16 len, u16 pfx) { struct sk_buff *skb; skb = alloc_skb(len + pfx, GFP_ATOMIC); if (unlikely(skb == NULL)) return NULL; skb_reserve(skb, pfx); return skb_to_pkt(skb); } inline struct cfpkt *cfpkt_create(u16 len) { return cfpkt_create_pfx(len + PKT_POSTFIX, PKT_PREFIX); } void cfpkt_destroy(struct cfpkt *pkt) { struct sk_buff *skb = pkt_to_skb(pkt); kfree_skb(skb); } inline bool cfpkt_more(struct cfpkt *pkt) { struct sk_buff *skb = pkt_to_skb(pkt); return skb->len > 0; } int cfpkt_peek_head(struct cfpkt *pkt, void *data, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); if (skb_headlen(skb) >= len) { memcpy(data, skb->data, len); return 0; } return !cfpkt_extr_head(pkt, data, len) && !cfpkt_add_head(pkt, data, len); } int cfpkt_extr_head(struct cfpkt *pkt, void *data, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); u8 *from; if (unlikely(is_erronous(pkt))) return -EPROTO; if (unlikely(len > skb->len)) { PKT_ERROR(pkt, "read beyond end of packet\n"); return -EPROTO; } if (unlikely(len > skb_headlen(skb))) { if (unlikely(skb_linearize(skb) != 0)) { PKT_ERROR(pkt, "linearize failed\n"); return -EPROTO; } } from = skb_pull(skb, len); from -= len; if (data) memcpy(data, from, len); return 0; } EXPORT_SYMBOL(cfpkt_extr_head); int cfpkt_extr_trail(struct cfpkt *pkt, void *dta, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); u8 *data = dta; u8 *from; if (unlikely(is_erronous(pkt))) return -EPROTO; if (unlikely(skb_linearize(skb) != 0)) { PKT_ERROR(pkt, "linearize failed\n"); return -EPROTO; } if (unlikely(skb->data + len > skb_tail_pointer(skb))) { PKT_ERROR(pkt, "read beyond end of packet\n"); return -EPROTO; } from = skb_tail_pointer(skb) - len; skb_trim(skb, skb->len - len); memcpy(data, from, len); return 0; } int cfpkt_pad_trail(struct cfpkt *pkt, u16 len) { return cfpkt_add_body(pkt, NULL, len); } int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); struct sk_buff *lastskb; u8 *to; u16 addlen = 0; if (unlikely(is_erronous(pkt))) return -EPROTO; lastskb = skb; /* Check whether we need to add space at the tail */ if (unlikely(skb_tailroom(skb) < len)) { if (likely(len < PKT_LEN_WHEN_EXTENDING)) addlen = PKT_LEN_WHEN_EXTENDING; else addlen = len; } /* Check whether we need to change the SKB before writing to the tail */ if (unlikely((addlen > 0) || skb_cloned(skb) || skb_shared(skb))) { /* Make sure data is writable */ if (unlikely(skb_cow_data(skb, addlen, &lastskb) < 0)) { PKT_ERROR(pkt, "cow failed\n"); return -EPROTO; } } /* All set to put the last SKB and optionally write data there. */ to = pskb_put(skb, lastskb, len); if (likely(data)) memcpy(to, data, len); return 0; } inline int cfpkt_addbdy(struct cfpkt *pkt, u8 data) { return cfpkt_add_body(pkt, &data, 1); } int cfpkt_add_head(struct cfpkt *pkt, const void *data2, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); struct sk_buff *lastskb; u8 *to; const u8 *data = data2; int ret; if (unlikely(is_erronous(pkt))) return -EPROTO; if (unlikely(skb_headroom(skb) < len)) { PKT_ERROR(pkt, "no headroom\n"); return -EPROTO; } /* Make sure data is writable */ ret = skb_cow_data(skb, 0, &lastskb); if (unlikely(ret < 0)) { PKT_ERROR(pkt, "cow failed\n"); return ret; } to = skb_push(skb, len); memcpy(to, data, len); return 0; } EXPORT_SYMBOL(cfpkt_add_head); inline int cfpkt_add_trail(struct cfpkt *pkt, const void *data, u16 len) { return cfpkt_add_body(pkt, data, len); } inline u16 cfpkt_getlen(struct cfpkt *pkt) { struct sk_buff *skb = pkt_to_skb(pkt); return skb->len; } int cfpkt_iterate(struct cfpkt *pkt, u16 (*iter_func)(u16, void *, u16), u16 data) { /* * Don't care about the performance hit of linearizing, * Checksum should not be used on high-speed interfaces anyway. */ if (unlikely(is_erronous(pkt))) return -EPROTO; if (unlikely(skb_linearize(&pkt->skb) != 0)) { PKT_ERROR(pkt, "linearize failed\n"); return -EPROTO; } return iter_func(data, pkt->skb.data, cfpkt_getlen(pkt)); } int cfpkt_setlen(struct cfpkt *pkt, u16 len) { struct sk_buff *skb = pkt_to_skb(pkt); if (unlikely(is_erronous(pkt))) return -EPROTO; if (likely(len <= skb->len)) { if (unlikely(skb->data_len)) ___pskb_trim(skb, len); else skb_trim(skb, len); return cfpkt_getlen(pkt); } /* Need to expand SKB */ if (unlikely(!cfpkt_pad_trail(pkt, len - skb->len))) PKT_ERROR(pkt, "skb_pad_trail failed\n"); return cfpkt_getlen(pkt); } struct cfpkt *cfpkt_append(struct cfpkt *dstpkt, struct cfpkt *addpkt, u16 expectlen) { struct sk_buff *dst = pkt_to_skb(dstpkt); struct sk_buff *add = pkt_to_skb(addpkt); u16 addlen = skb_headlen(add); u16 neededtailspace; struct sk_buff *tmp; u16 dstlen; u16 createlen; if (unlikely(is_erronous(dstpkt) || is_erronous(addpkt))) { return dstpkt; } if (expectlen > addlen) neededtailspace = expectlen; else neededtailspace = addlen; if (dst->tail + neededtailspace > dst->end) { /* Create a dumplicate of 'dst' with more tail space */ struct cfpkt *tmppkt; dstlen = skb_headlen(dst); createlen = dstlen + neededtailspace; tmppkt = cfpkt_create(createlen + PKT_PREFIX + PKT_POSTFIX); if (tmppkt == NULL) return NULL; tmp = pkt_to_skb(tmppkt); skb_put_data(tmp, dst->data, dstlen); cfpkt_destroy(dstpkt); dst = tmp; } skb_put_data(dst, add->data, skb_headlen(add)); cfpkt_destroy(addpkt); return skb_to_pkt(dst); } struct cfpkt *cfpkt_split(struct cfpkt *pkt, u16 pos) { struct sk_buff *skb2; struct sk_buff *skb = pkt_to_skb(pkt); struct cfpkt *tmppkt; u8 *split = skb->data + pos; u16 len2nd = skb_tail_pointer(skb) - split; if (unlikely(is_erronous(pkt))) return NULL; if (skb->data + pos > skb_tail_pointer(skb)) { PKT_ERROR(pkt, "trying to split beyond end of packet\n"); return NULL; } /* Create a new packet for the second part of the data */ tmppkt = cfpkt_create_pfx(len2nd + PKT_PREFIX + PKT_POSTFIX, PKT_PREFIX); if (tmppkt == NULL) return NULL; skb2 = pkt_to_skb(tmppkt); if (skb2 == NULL) return NULL; skb_put_data(skb2, split, len2nd); /* Reduce the length of the original packet */ skb_trim(skb, pos); skb2->priority = skb->priority; return skb_to_pkt(skb2); } bool cfpkt_erroneous(struct cfpkt *pkt) { return cfpkt_priv(pkt)->erronous; } struct caif_payload_info *cfpkt_info(struct cfpkt *pkt) { return (struct caif_payload_info *)&pkt_to_skb(pkt)->cb; } EXPORT_SYMBOL(cfpkt_info); void cfpkt_set_prio(struct cfpkt *pkt, int prio) { pkt_to_skb(pkt)->priority = prio; } EXPORT_SYMBOL(cfpkt_set_prio);
12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Fallback per-CPU frame pointer holder * * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _ASM_GENERIC_IRQ_REGS_H #define _ASM_GENERIC_IRQ_REGS_H #include <linux/percpu.h> /* * Per-cpu current frame pointer - the location of the last exception frame on * the stack */ DECLARE_PER_CPU(struct pt_regs *, __irq_regs); static inline struct pt_regs *get_irq_regs(void) { return __this_cpu_read(__irq_regs); } static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs) { struct pt_regs *old_regs; old_regs = __this_cpu_read(__irq_regs); __this_cpu_write(__irq_regs, new_regs); return old_regs; } #endif /* _ASM_GENERIC_IRQ_REGS_H */
40 39 19 15 5 17 17 1 19 1 18 18 18 17 17 1 3 1 1 1 1 6 12 18 18 17 1303 15 5 18 1 19 1360 1364 1364 17 40 7 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991-1998 Linus Torvalds * Re-organised Feb 1998 Russell King * Copyright (C) 2020 Christoph Hellwig */ #include <linux/fs.h> #include <linux/major.h> #include <linux/slab.h> #include <linux/ctype.h> #include <linux/vmalloc.h> #include <linux/raid/detect.h> #include "check.h" static int (*const check_part[])(struct parsed_partitions *) = { /* * Probe partition formats with tables at disk address 0 * that also have an ADFS boot block at 0xdc0. */ #ifdef CONFIG_ACORN_PARTITION_ICS adfspart_check_ICS, #endif #ifdef CONFIG_ACORN_PARTITION_POWERTEC adfspart_check_POWERTEC, #endif #ifdef CONFIG_ACORN_PARTITION_EESOX adfspart_check_EESOX, #endif /* * Now move on to formats that only have partition info at * disk address 0xdc0. Since these may also have stale * PC/BIOS partition tables, they need to come before * the msdos entry. */ #ifdef CONFIG_ACORN_PARTITION_CUMANA adfspart_check_CUMANA, #endif #ifdef CONFIG_ACORN_PARTITION_ADFS adfspart_check_ADFS, #endif #ifdef CONFIG_CMDLINE_PARTITION cmdline_partition, #endif #ifdef CONFIG_EFI_PARTITION efi_partition, /* this must come before msdos */ #endif #ifdef CONFIG_SGI_PARTITION sgi_partition, #endif #ifdef CONFIG_LDM_PARTITION ldm_partition, /* this must come before msdos */ #endif #ifdef CONFIG_MSDOS_PARTITION msdos_partition, #endif #ifdef CONFIG_OSF_PARTITION osf_partition, #endif #ifdef CONFIG_SUN_PARTITION sun_partition, #endif #ifdef CONFIG_AMIGA_PARTITION amiga_partition, #endif #ifdef CONFIG_ATARI_PARTITION atari_partition, #endif #ifdef CONFIG_MAC_PARTITION mac_partition, #endif #ifdef CONFIG_ULTRIX_PARTITION ultrix_partition, #endif #ifdef CONFIG_IBM_PARTITION ibm_partition, #endif #ifdef CONFIG_KARMA_PARTITION karma_partition, #endif #ifdef CONFIG_SYSV68_PARTITION sysv68_partition, #endif NULL }; static struct parsed_partitions *allocate_partitions(struct gendisk *hd) { struct parsed_partitions *state; int nr = DISK_MAX_PARTS; state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return NULL; state->parts = vzalloc(array_size(nr, sizeof(state->parts[0]))); if (!state->parts) { kfree(state); return NULL; } state->limit = nr; return state; } static void free_partitions(struct parsed_partitions *state) { vfree(state->parts); kfree(state); } static struct parsed_partitions *check_partition(struct gendisk *hd) { struct parsed_partitions *state; int i, res, err; state = allocate_partitions(hd); if (!state) return NULL; state->pp_buf = (char *)__get_free_page(GFP_KERNEL); if (!state->pp_buf) { free_partitions(state); return NULL; } state->pp_buf[0] = '\0'; state->disk = hd; snprintf(state->name, BDEVNAME_SIZE, "%s", hd->disk_name); snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) sprintf(state->name, "p"); i = res = err = 0; while (!res && check_part[i]) { memset(state->parts, 0, state->limit * sizeof(state->parts[0])); res = check_part[i++](state); if (res < 0) { /* * We have hit an I/O error which we don't report now. * But record it, and let the others do their job. */ err = res; res = 0; } } if (res > 0) { printk(KERN_INFO "%s", state->pp_buf); free_page((unsigned long)state->pp_buf); return state; } if (state->access_beyond_eod) err = -ENOSPC; /* * The partition is unrecognized. So report I/O errors if there were any */ if (err) res = err; if (res) { strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE); printk(KERN_INFO "%s", state->pp_buf); } free_page((unsigned long)state->pp_buf); free_partitions(state); return ERR_PTR(res); } static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_partno); } static ssize_t part_start_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect); } static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", bdev_read_only(dev_to_bdev(dev))); } static ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%u\n", bdev_alignment_offset(dev_to_bdev(dev))); } static ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev))); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); static DEVICE_ATTR(start, 0444, part_start_show, NULL); static DEVICE_ATTR(size, 0444, part_size_show, NULL); static DEVICE_ATTR(ro, 0444, part_ro_show, NULL); static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL); static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); #endif static struct attribute *part_attrs[] = { &dev_attr_partition.attr, &dev_attr_start.attr, &dev_attr_size.attr, &dev_attr_ro.attr, &dev_attr_alignment_offset.attr, &dev_attr_discard_alignment.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif NULL }; static const struct attribute_group part_attr_group = { .attrs = part_attrs, }; static const struct attribute_group *part_attr_groups[] = { &part_attr_group, #ifdef CONFIG_BLK_DEV_IO_TRACE &blk_trace_attr_group, #endif NULL }; static void part_release(struct device *dev) { put_disk(dev_to_bdev(dev)->bd_disk); iput(dev_to_bdev(dev)->bd_inode); } static int part_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct block_device *part = dev_to_bdev(dev); add_uevent_var(env, "PARTN=%u", part->bd_partno); if (part->bd_meta_info && part->bd_meta_info->volname[0]) add_uevent_var(env, "PARTNAME=%s", part->bd_meta_info->volname); return 0; } const struct device_type part_type = { .name = "partition", .groups = part_attr_groups, .release = part_release, .uevent = part_uevent, }; void drop_partition(struct block_device *part) { lockdep_assert_held(&part->bd_disk->open_mutex); xa_erase(&part->bd_disk->part_tbl, part->bd_partno); kobject_put(part->bd_holder_dir); device_del(&part->bd_device); put_device(&part->bd_device); } static ssize_t whole_disk_show(struct device *dev, struct device_attribute *attr, char *buf) { return 0; } static const DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); /* * Must be called either with open_mutex held, before a disk can be opened or * after all disk users are gone. */ static struct block_device *add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags, struct partition_meta_info *info) { dev_t devt = MKDEV(0, 0); struct device *ddev = disk_to_dev(disk); struct device *pdev; struct block_device *bdev; const char *dname; int err; lockdep_assert_held(&disk->open_mutex); if (partno >= DISK_MAX_PARTS) return ERR_PTR(-EINVAL); /* * Partitions are not supported on zoned block devices that are used as * such. */ switch (disk->queue->limits.zoned) { case BLK_ZONED_HM: pr_warn("%s: partitions not supported on host managed zoned block device\n", disk->disk_name); return ERR_PTR(-ENXIO); case BLK_ZONED_HA: pr_info("%s: disabling host aware zoned block device support due to partitions\n", disk->disk_name); disk_set_zoned(disk, BLK_ZONED_NONE); break; case BLK_ZONED_NONE: break; } if (xa_load(&disk->part_tbl, partno)) return ERR_PTR(-EBUSY); /* ensure we always have a reference to the whole disk */ get_device(disk_to_dev(disk)); err = -ENOMEM; bdev = bdev_alloc(disk, partno); if (!bdev) goto out_put_disk; bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); pdev = &bdev->bd_device; dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) dev_set_name(pdev, "%sp%d", dname, partno); else dev_set_name(pdev, "%s%d", dname, partno); device_initialize(pdev); pdev->class = &block_class; pdev->type = &part_type; pdev->parent = ddev; /* in consecutive minor range? */ if (bdev->bd_partno < disk->minors) { devt = MKDEV(disk->major, disk->first_minor + bdev->bd_partno); } else { err = blk_alloc_ext_minor(); if (err < 0) goto out_put; devt = MKDEV(BLOCK_EXT_MAJOR, err); } pdev->devt = devt; if (info) { err = -ENOMEM; bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL); if (!bdev->bd_meta_info) goto out_put; } /* delay uevent until 'holders' subdir is created */ dev_set_uevent_suppress(pdev, 1); err = device_add(pdev); if (err) goto out_put; err = -ENOMEM; bdev->bd_holder_dir = kobject_create_and_add("holders", &pdev->kobj); if (!bdev->bd_holder_dir) goto out_del; dev_set_uevent_suppress(pdev, 0); if (flags & ADDPART_FLAG_WHOLEDISK) { err = device_create_file(pdev, &dev_attr_whole_disk); if (err) goto out_del; } /* everything is up and running, commence */ err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL); if (err) goto out_del; bdev_add(bdev, devt); /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); return bdev; out_del: kobject_put(bdev->bd_holder_dir); device_del(pdev); out_put: put_device(pdev); return ERR_PTR(err); out_put_disk: put_disk(disk); return ERR_PTR(err); } static bool partition_overlaps(struct gendisk *disk, sector_t start, sector_t length, int skip_partno) { struct block_device *part; bool overlap = false; unsigned long idx; rcu_read_lock(); xa_for_each_start(&disk->part_tbl, idx, part, 1) { if (part->bd_partno != skip_partno && start < part->bd_start_sect + bdev_nr_sectors(part) && start + length > part->bd_start_sect) { overlap = true; break; } } rcu_read_unlock(); return overlap; } int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length) { sector_t capacity = get_capacity(disk), end; struct block_device *part; int ret; mutex_lock(&disk->open_mutex); if (check_add_overflow(start, length, &end)) { ret = -EINVAL; goto out; } if (start >= capacity || end > capacity) { ret = -EINVAL; goto out; } if (!disk_live(disk)) { ret = -ENXIO; goto out; } if (partition_overlaps(disk, start, length, -1)) { ret = -EBUSY; goto out; } part = add_partition(disk, partno, start, length, ADDPART_FLAG_NONE, NULL); ret = PTR_ERR_OR_ZERO(part); out: mutex_unlock(&disk->open_mutex); return ret; } int bdev_del_partition(struct gendisk *disk, int partno) { struct block_device *part = NULL; int ret = -ENXIO; mutex_lock(&disk->open_mutex); part = xa_load(&disk->part_tbl, partno); if (!part) goto out_unlock; ret = -EBUSY; if (atomic_read(&part->bd_openers)) goto out_unlock; /* * We verified that @part->bd_openers is zero above and so * @part->bd_holder{_ops} can't be set. And since we hold * @disk->open_mutex the device can't be claimed by anyone. * * So no need to call @part->bd_holder_ops->mark_dead() here. * Just delete the partition and invalidate it. */ remove_inode_hash(part->bd_inode); invalidate_bdev(part); drop_partition(part); ret = 0; out_unlock: mutex_unlock(&disk->open_mutex); return ret; } int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, sector_t length) { struct block_device *part = NULL; int ret = -ENXIO; mutex_lock(&disk->open_mutex); part = xa_load(&disk->part_tbl, partno); if (!part) goto out_unlock; ret = -EINVAL; if (start != part->bd_start_sect) goto out_unlock; ret = -EBUSY; if (partition_overlaps(disk, start, length, partno)) goto out_unlock; bdev_set_nr_sectors(part, length); ret = 0; out_unlock: mutex_unlock(&disk->open_mutex); return ret; } static bool disk_unlock_native_capacity(struct gendisk *disk) { if (!disk->fops->unlock_native_capacity || test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state)) { printk(KERN_CONT "truncated\n"); return false; } printk(KERN_CONT "enabling native capacity\n"); disk->fops->unlock_native_capacity(disk); return true; } static bool blk_add_partition(struct gendisk *disk, struct parsed_partitions *state, int p) { sector_t size = state->parts[p].size; sector_t from = state->parts[p].from; struct block_device *part; if (!size) return true; if (from >= get_capacity(disk)) { printk(KERN_WARNING "%s: p%d start %llu is beyond EOD, ", disk->disk_name, p, (unsigned long long) from); if (disk_unlock_native_capacity(disk)) return false; return true; } if (from + size > get_capacity(disk)) { printk(KERN_WARNING "%s: p%d size %llu extends beyond EOD, ", disk->disk_name, p, (unsigned long long) size); if (disk_unlock_native_capacity(disk)) return false; /* * We can not ignore partitions of broken tables created by for * example camera firmware, but we limit them to the end of the * disk to avoid creating invalid block devices. */ size = get_capacity(disk) - from; } part = add_partition(disk, p, from, size, state->parts[p].flags, &state->parts[p].info); if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) { printk(KERN_ERR " %s: p%d could not be added: %ld\n", disk->disk_name, p, -PTR_ERR(part)); return true; } if (IS_BUILTIN(CONFIG_BLK_DEV_MD) && (state->parts[p].flags & ADDPART_FLAG_RAID)) md_autodetect_dev(part->bd_dev); return true; } static int blk_add_partitions(struct gendisk *disk) { struct parsed_partitions *state; int ret = -EAGAIN, p; if (disk->flags & GENHD_FL_NO_PART) return 0; if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) return 0; state = check_partition(disk); if (!state) return 0; if (IS_ERR(state)) { /* * I/O error reading the partition table. If we tried to read * beyond EOD, retry after unlocking the native capacity. */ if (PTR_ERR(state) == -ENOSPC) { printk(KERN_WARNING "%s: partition table beyond EOD, ", disk->disk_name); if (disk_unlock_native_capacity(disk)) return -EAGAIN; } return -EIO; } /* * Partitions are not supported on host managed zoned block devices. */ if (disk->queue->limits.zoned == BLK_ZONED_HM) { pr_warn("%s: ignoring partition table on host managed zoned block device\n", disk->disk_name); ret = 0; goto out_free_state; } /* * If we read beyond EOD, try unlocking native capacity even if the * partition table was successfully read as we could be missing some * partitions. */ if (state->access_beyond_eod) { printk(KERN_WARNING "%s: partition table partially beyond EOD, ", disk->disk_name); if (disk_unlock_native_capacity(disk)) goto out_free_state; } /* tell userspace that the media / partition table may have changed */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); for (p = 1; p < state->limit; p++) if (!blk_add_partition(disk, state, p)) goto out_free_state; ret = 0; out_free_state: free_partitions(state); return ret; } int bdev_disk_changed(struct gendisk *disk, bool invalidate) { struct block_device *part; unsigned long idx; int ret = 0; lockdep_assert_held(&disk->open_mutex); if (!disk_live(disk)) return -ENXIO; rescan: if (disk->open_partitions) return -EBUSY; sync_blockdev(disk->part0); invalidate_bdev(disk->part0); xa_for_each_start(&disk->part_tbl, idx, part, 1) { /* * Remove the block device from the inode hash, so that * it cannot be looked up any more even when openers * still hold references. */ remove_inode_hash(part->bd_inode); /* * If @disk->open_partitions isn't elevated but there's * still an active holder of that block device things * are broken. */ WARN_ON_ONCE(atomic_read(&part->bd_openers)); invalidate_bdev(part); drop_partition(part); } clear_bit(GD_NEED_PART_SCAN, &disk->state); /* * Historically we only set the capacity to zero for devices that * support partitions (independ of actually having partitions created). * Doing that is rather inconsistent, but changing it broke legacy * udisks polling for legacy ide-cdrom devices. Use the crude check * below to get the sane behavior for most device while not breaking * userspace for this particular setup. */ if (invalidate) { if (!(disk->flags & GENHD_FL_NO_PART) || !(disk->flags & GENHD_FL_REMOVABLE)) set_capacity(disk, 0); } if (get_capacity(disk)) { ret = blk_add_partitions(disk); if (ret == -EAGAIN) goto rescan; } else if (invalidate) { /* * Tell userspace that the media / partition table may have * changed. */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); } return ret; } /* * Only exported for loop and dasd for historic reasons. Don't use in new * code! */ EXPORT_SYMBOL_GPL(bdev_disk_changed); void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) { struct address_space *mapping = state->disk->part0->bd_inode->i_mapping; struct folio *folio; if (n >= get_capacity(state->disk)) { state->access_beyond_eod = true; goto out; } folio = read_mapping_folio(mapping, n >> PAGE_SECTORS_SHIFT, NULL); if (IS_ERR(folio)) goto out; p->v = folio; return folio_address(folio) + offset_in_folio(folio, n * SECTOR_SIZE); out: p->v = NULL; return NULL; }
7 3 2 7 6 7 2 7 6 2 7 1 1 2 5 5 2 1 2 2 1 1 2 2 1 3 7 4 6 1 3 4 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 // SPDX-License-Identifier: GPL-2.0-only /* * H-TCP congestion control. The algorithm is detailed in: * R.N.Shorten, D.J.Leith: * "H-TCP: TCP for high-speed and long-distance networks" * Proc. PFLDnet, Argonne, 2004. * https://www.hamilton.ie/net/htcp3.pdf */ #include <linux/mm.h> #include <linux/module.h> #include <net/tcp.h> #define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */ #define BETA_MIN (1<<6) /* 0.5 with shift << 7 */ #define BETA_MAX 102 /* 0.8 with shift << 7 */ static int use_rtt_scaling __read_mostly = 1; module_param(use_rtt_scaling, int, 0644); MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling"); static int use_bandwidth_switch __read_mostly = 1; module_param(use_bandwidth_switch, int, 0644); MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher"); struct htcp { u32 alpha; /* Fixed point arith, << 7 */ u8 beta; /* Fixed point arith, << 7 */ u8 modeswitch; /* Delay modeswitch until we had at least one congestion event */ u16 pkts_acked; u32 packetcount; u32 minRTT; u32 maxRTT; u32 last_cong; /* Time since last congestion event end */ u32 undo_last_cong; u32 undo_maxRTT; u32 undo_old_maxB; /* Bandwidth estimation */ u32 minB; u32 maxB; u32 old_maxB; u32 Bi; u32 lasttime; }; static inline u32 htcp_cong_time(const struct htcp *ca) { return jiffies - ca->last_cong; } static inline u32 htcp_ccount(const struct htcp *ca) { return htcp_cong_time(ca) / ca->minRTT; } static inline void htcp_reset(struct htcp *ca) { ca->undo_last_cong = ca->last_cong; ca->undo_maxRTT = ca->maxRTT; ca->undo_old_maxB = ca->old_maxB; ca->last_cong = jiffies; } static u32 htcp_cwnd_undo(struct sock *sk) { struct htcp *ca = inet_csk_ca(sk); if (ca->undo_last_cong) { ca->last_cong = ca->undo_last_cong; ca->maxRTT = ca->undo_maxRTT; ca->old_maxB = ca->undo_old_maxB; ca->undo_last_cong = 0; } return tcp_reno_undo_cwnd(sk); } static inline void measure_rtt(struct sock *sk, u32 srtt) { const struct inet_connection_sock *icsk = inet_csk(sk); struct htcp *ca = inet_csk_ca(sk); /* keep track of minimum RTT seen so far, minRTT is zero at first */ if (ca->minRTT > srtt || !ca->minRTT) ca->minRTT = srtt; /* max RTT */ if (icsk->icsk_ca_state == TCP_CA_Open) { if (ca->maxRTT < ca->minRTT) ca->maxRTT = ca->minRTT; if (ca->maxRTT < srtt && srtt <= ca->maxRTT + msecs_to_jiffies(20)) ca->maxRTT = srtt; } } static void measure_achieved_throughput(struct sock *sk, const struct ack_sample *sample) { const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); u32 now = tcp_jiffies32; if (icsk->icsk_ca_state == TCP_CA_Open) ca->pkts_acked = sample->pkts_acked; if (sample->rtt_us > 0) measure_rtt(sk, usecs_to_jiffies(sample->rtt_us)); if (!use_bandwidth_switch) return; /* achieved throughput calculations */ if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_Disorder))) { ca->packetcount = 0; ca->lasttime = now; return; } ca->packetcount += sample->pkts_acked; if (ca->packetcount >= tcp_snd_cwnd(tp) - (ca->alpha >> 7 ? : 1) && now - ca->lasttime >= ca->minRTT && ca->minRTT > 0) { __u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime); if (htcp_ccount(ca) <= 3) { /* just after backoff */ ca->minB = ca->maxB = ca->Bi = cur_Bi; } else { ca->Bi = (3 * ca->Bi + cur_Bi) / 4; if (ca->Bi > ca->maxB) ca->maxB = ca->Bi; if (ca->minB > ca->maxB) ca->minB = ca->maxB; } ca->packetcount = 0; ca->lasttime = now; } } static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT) { if (use_bandwidth_switch) { u32 maxB = ca->maxB; u32 old_maxB = ca->old_maxB; ca->old_maxB = ca->maxB; if (!between(5 * maxB, 4 * old_maxB, 6 * old_maxB)) { ca->beta = BETA_MIN; ca->modeswitch = 0; return; } } if (ca->modeswitch && minRTT > msecs_to_jiffies(10) && maxRTT) { ca->beta = (minRTT << 7) / maxRTT; if (ca->beta < BETA_MIN) ca->beta = BETA_MIN; else if (ca->beta > BETA_MAX) ca->beta = BETA_MAX; } else { ca->beta = BETA_MIN; ca->modeswitch = 1; } } static inline void htcp_alpha_update(struct htcp *ca) { u32 minRTT = ca->minRTT; u32 factor = 1; u32 diff = htcp_cong_time(ca); if (diff > HZ) { diff -= HZ; factor = 1 + (10 * diff + ((diff / 2) * (diff / 2) / HZ)) / HZ; } if (use_rtt_scaling && minRTT) { u32 scale = (HZ << 3) / (10 * minRTT); /* clamping ratio to interval [0.5,10]<<3 */ scale = min(max(scale, 1U << 2), 10U << 3); factor = (factor << 3) / scale; if (!factor) factor = 1; } ca->alpha = 2 * factor * ((1 << 7) - ca->beta); if (!ca->alpha) ca->alpha = ALPHA_BASE; } /* * After we have the rtt data to calculate beta, we'd still prefer to wait one * rtt before we adjust our beta to ensure we are working from a consistent * data. * * This function should be called when we hit a congestion event since only at * that point do we really have a real sense of maxRTT (the queues en route * were getting just too full now). */ static void htcp_param_update(struct sock *sk) { struct htcp *ca = inet_csk_ca(sk); u32 minRTT = ca->minRTT; u32 maxRTT = ca->maxRTT; htcp_beta_update(ca, minRTT, maxRTT); htcp_alpha_update(ca); /* add slowly fading memory for maxRTT to accommodate routing changes */ if (minRTT > 0 && maxRTT > minRTT) ca->maxRTT = minRTT + ((maxRTT - minRTT) * 95) / 100; } static u32 htcp_recalc_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct htcp *ca = inet_csk_ca(sk); htcp_param_update(sk); return max((tcp_snd_cwnd(tp) * ca->beta) >> 7, 2U); } static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); if (!tcp_is_cwnd_limited(sk)) return; if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else { /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd */ if ((tp->snd_cwnd_cnt * ca->alpha)>>7 >= tcp_snd_cwnd(tp)) { if (tcp_snd_cwnd(tp) < tp->snd_cwnd_clamp) tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); tp->snd_cwnd_cnt = 0; htcp_alpha_update(ca); } else tp->snd_cwnd_cnt += ca->pkts_acked; ca->pkts_acked = 1; } } static void htcp_init(struct sock *sk) { struct htcp *ca = inet_csk_ca(sk); memset(ca, 0, sizeof(struct htcp)); ca->alpha = ALPHA_BASE; ca->beta = BETA_MIN; ca->pkts_acked = 1; ca->last_cong = jiffies; } static void htcp_state(struct sock *sk, u8 new_state) { switch (new_state) { case TCP_CA_Open: { struct htcp *ca = inet_csk_ca(sk); if (ca->undo_last_cong) { ca->last_cong = jiffies; ca->undo_last_cong = 0; } } break; case TCP_CA_CWR: case TCP_CA_Recovery: case TCP_CA_Loss: htcp_reset(inet_csk_ca(sk)); break; } } static struct tcp_congestion_ops htcp __read_mostly = { .init = htcp_init, .ssthresh = htcp_recalc_ssthresh, .cong_avoid = htcp_cong_avoid, .set_state = htcp_state, .undo_cwnd = htcp_cwnd_undo, .pkts_acked = measure_achieved_throughput, .owner = THIS_MODULE, .name = "htcp", }; static int __init htcp_register(void) { BUILD_BUG_ON(sizeof(struct htcp) > ICSK_CA_PRIV_SIZE); BUILD_BUG_ON(BETA_MIN >= BETA_MAX); return tcp_register_congestion_control(&htcp); } static void __exit htcp_unregister(void) { tcp_unregister_congestion_control(&htcp); } module_init(htcp_register); module_exit(htcp_unregister); MODULE_AUTHOR("Baruch Even"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("H-TCP");
5 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 // SPDX-License-Identifier: GPL-2.0-or-later /* * Spanning tree protocol; BPDU handling * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/kernel.h> #include <linux/netfilter_bridge.h> #include <linux/etherdevice.h> #include <linux/llc.h> #include <linux/slab.h> #include <linux/pkt_sched.h> #include <net/net_namespace.h> #include <net/llc.h> #include <net/llc_pdu.h> #include <net/stp.h> #include <asm/unaligned.h> #include "br_private.h" #include "br_private_stp.h" #define STP_HZ 256 #define LLC_RESERVE sizeof(struct llc_pdu_un) static int br_send_bpdu_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { return dev_queue_xmit(skb); } static void br_send_bpdu(struct net_bridge_port *p, const unsigned char *data, int length) { struct sk_buff *skb; skb = dev_alloc_skb(length+LLC_RESERVE); if (!skb) return; skb->dev = p->dev; skb->protocol = htons(ETH_P_802_2); skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, LLC_RESERVE); __skb_put_data(skb, data, length); llc_pdu_header_init(skb, LLC_PDU_TYPE_U, LLC_SAP_BSPAN, LLC_SAP_BSPAN, LLC_PDU_CMD); llc_pdu_init_as_ui_cmd(skb); llc_mac_hdr_init(skb, p->dev->dev_addr, p->br->group_addr); skb_reset_mac_header(skb); NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, dev_net(p->dev), NULL, skb, NULL, skb->dev, br_send_bpdu_finish); } static inline void br_set_ticks(unsigned char *dest, int j) { unsigned long ticks = (STP_HZ * j)/ HZ; put_unaligned_be16(ticks, dest); } static inline int br_get_ticks(const unsigned char *src) { unsigned long ticks = get_unaligned_be16(src); return DIV_ROUND_UP(ticks * HZ, STP_HZ); } /* called under bridge lock */ void br_send_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu) { unsigned char buf[35]; if (p->br->stp_enabled != BR_KERNEL_STP) return; buf[0] = 0; buf[1] = 0; buf[2] = 0; buf[3] = BPDU_TYPE_CONFIG; buf[4] = (bpdu->topology_change ? 0x01 : 0) | (bpdu->topology_change_ack ? 0x80 : 0); buf[5] = bpdu->root.prio[0]; buf[6] = bpdu->root.prio[1]; buf[7] = bpdu->root.addr[0]; buf[8] = bpdu->root.addr[1]; buf[9] = bpdu->root.addr[2]; buf[10] = bpdu->root.addr[3]; buf[11] = bpdu->root.addr[4]; buf[12] = bpdu->root.addr[5]; buf[13] = (bpdu->root_path_cost >> 24) & 0xFF; buf[14] = (bpdu->root_path_cost >> 16) & 0xFF; buf[15] = (bpdu->root_path_cost >> 8) & 0xFF; buf[16] = bpdu->root_path_cost & 0xFF; buf[17] = bpdu->bridge_id.prio[0]; buf[18] = bpdu->bridge_id.prio[1]; buf[19] = bpdu->bridge_id.addr[0]; buf[20] = bpdu->bridge_id.addr[1]; buf[21] = bpdu->bridge_id.addr[2]; buf[22] = bpdu->bridge_id.addr[3]; buf[23] = bpdu->bridge_id.addr[4]; buf[24] = bpdu->bridge_id.addr[5]; buf[25] = (bpdu->port_id >> 8) & 0xFF; buf[26] = bpdu->port_id & 0xFF; br_set_ticks(buf+27, bpdu->message_age); br_set_ticks(buf+29, bpdu->max_age); br_set_ticks(buf+31, bpdu->hello_time); br_set_ticks(buf+33, bpdu->forward_delay); br_send_bpdu(p, buf, 35); p->stp_xstats.tx_bpdu++; } /* called under bridge lock */ void br_send_tcn_bpdu(struct net_bridge_port *p) { unsigned char buf[4]; if (p->br->stp_enabled != BR_KERNEL_STP) return; buf[0] = 0; buf[1] = 0; buf[2] = 0; buf[3] = BPDU_TYPE_TCN; br_send_bpdu(p, buf, 4); p->stp_xstats.tx_tcn++; } /* * Called from llc. * * NO locks, but rcu_read_lock */ void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb, struct net_device *dev) { struct net_bridge_port *p; struct net_bridge *br; const unsigned char *buf; if (!pskb_may_pull(skb, 4)) goto err; /* compare of protocol id and version */ buf = skb->data; if (buf[0] != 0 || buf[1] != 0 || buf[2] != 0) goto err; p = br_port_get_check_rcu(dev); if (!p) goto err; br = p->br; spin_lock(&br->lock); if (br->stp_enabled != BR_KERNEL_STP) goto out; if (!(br->dev->flags & IFF_UP)) goto out; if (p->state == BR_STATE_DISABLED) goto out; if (!ether_addr_equal(eth_hdr(skb)->h_dest, br->group_addr)) goto out; if (p->flags & BR_BPDU_GUARD) { br_notice(br, "BPDU received on blocked port %u(%s)\n", (unsigned int) p->port_no, p->dev->name); br_stp_disable_port(p); goto out; } buf = skb_pull(skb, 3); if (buf[0] == BPDU_TYPE_CONFIG) { struct br_config_bpdu bpdu; if (!pskb_may_pull(skb, 32)) goto out; buf = skb->data; bpdu.topology_change = (buf[1] & 0x01) ? 1 : 0; bpdu.topology_change_ack = (buf[1] & 0x80) ? 1 : 0; bpdu.root.prio[0] = buf[2]; bpdu.root.prio[1] = buf[3]; bpdu.root.addr[0] = buf[4]; bpdu.root.addr[1] = buf[5]; bpdu.root.addr[2] = buf[6]; bpdu.root.addr[3] = buf[7]; bpdu.root.addr[4] = buf[8]; bpdu.root.addr[5] = buf[9]; bpdu.root_path_cost = (buf[10] << 24) | (buf[11] << 16) | (buf[12] << 8) | buf[13]; bpdu.bridge_id.prio[0] = buf[14]; bpdu.bridge_id.prio[1] = buf[15]; bpdu.bridge_id.addr[0] = buf[16]; bpdu.bridge_id.addr[1] = buf[17]; bpdu.bridge_id.addr[2] = buf[18]; bpdu.bridge_id.addr[3] = buf[19]; bpdu.bridge_id.addr[4] = buf[20]; bpdu.bridge_id.addr[5] = buf[21]; bpdu.port_id = (buf[22] << 8) | buf[23]; bpdu.message_age = br_get_ticks(buf+24); bpdu.max_age = br_get_ticks(buf+26); bpdu.hello_time = br_get_ticks(buf+28); bpdu.forward_delay = br_get_ticks(buf+30); if (bpdu.message_age > bpdu.max_age) { if (net_ratelimit()) br_notice(p->br, "port %u config from %pM" " (message_age %ul > max_age %ul)\n", p->port_no, eth_hdr(skb)->h_source, bpdu.message_age, bpdu.max_age); goto out; } br_received_config_bpdu(p, &bpdu); } else if (buf[0] == BPDU_TYPE_TCN) { br_received_tcn_bpdu(p); } out: spin_unlock(&br->lock); err: kfree_skb(skb); }
2 2 2 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/preempt.h> #include <linux/smp.h> #include <linux/completion.h> #include <asm/msr.h> static void __rdmsr_on_cpu(void *info) { struct msr_info *rv = info; struct msr *reg; int this_cpu = raw_smp_processor_id(); if (rv->msrs) reg = per_cpu_ptr(rv->msrs, this_cpu); else reg = &rv->reg; rdmsr(rv->msr_no, reg->l, reg->h); } static void __wrmsr_on_cpu(void *info) { struct msr_info *rv = info; struct msr *reg; int this_cpu = raw_smp_processor_id(); if (rv->msrs) reg = per_cpu_ptr(rv->msrs, this_cpu); else reg = &rv->reg; wrmsr(rv->msr_no, reg->l, reg->h); } int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); *l = rv.reg.l; *h = rv.reg.h; return err; } EXPORT_SYMBOL(rdmsr_on_cpu); int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); *q = rv.reg.q; return err; } EXPORT_SYMBOL(rdmsrl_on_cpu); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; rv.reg.l = l; rv.reg.h = h; err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); return err; } EXPORT_SYMBOL(wrmsr_on_cpu); int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; rv.reg.q = q; err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); return err; } EXPORT_SYMBOL(wrmsrl_on_cpu); static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs, void (*msr_func) (void *info)) { struct msr_info rv; int this_cpu; memset(&rv, 0, sizeof(rv)); rv.msrs = msrs; rv.msr_no = msr_no; this_cpu = get_cpu(); if (cpumask_test_cpu(this_cpu, mask)) msr_func(&rv); smp_call_function_many(mask, msr_func, &rv, 1); put_cpu(); } /* rdmsr on a bunch of CPUs * * @mask: which CPUs * @msr_no: which MSR * @msrs: array of MSR values * */ void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) { __rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu); } EXPORT_SYMBOL(rdmsr_on_cpus); /* * wrmsr on a bunch of CPUs * * @mask: which CPUs * @msr_no: which MSR * @msrs: array of MSR values * */ void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) { __rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu); } EXPORT_SYMBOL(wrmsr_on_cpus); struct msr_info_completion { struct msr_info msr; struct completion done; }; /* These "safe" variants are slower and should be used when the target MSR may not actually exist. */ static void __rdmsr_safe_on_cpu(void *info) { struct msr_info_completion *rv = info; rv->msr.err = rdmsr_safe(rv->msr.msr_no, &rv->msr.reg.l, &rv->msr.reg.h); complete(&rv->done); } static void __wrmsr_safe_on_cpu(void *info) { struct msr_info *rv = info; rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h); } int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { struct msr_info_completion rv; call_single_data_t csd; int err; INIT_CSD(&csd, __rdmsr_safe_on_cpu, &rv); memset(&rv, 0, sizeof(rv)); init_completion(&rv.done); rv.msr.msr_no = msr_no; err = smp_call_function_single_async(cpu, &csd); if (!err) { wait_for_completion(&rv.done); err = rv.msr.err; } *l = rv.msr.reg.l; *h = rv.msr.reg.h; return err; } EXPORT_SYMBOL(rdmsr_safe_on_cpu); int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; rv.reg.l = l; rv.reg.h = h; err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); return err ? err : rv.err; } EXPORT_SYMBOL(wrmsr_safe_on_cpu); int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; rv.reg.q = q; err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); return err ? err : rv.err; } EXPORT_SYMBOL(wrmsrl_safe_on_cpu); int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { u32 low, high; int err; err = rdmsr_safe_on_cpu(cpu, msr_no, &low, &high); *q = (u64)high << 32 | low; return err; } EXPORT_SYMBOL(rdmsrl_safe_on_cpu); /* * These variants are significantly slower, but allows control over * the entire 32-bit GPR set. */ static void __rdmsr_safe_regs_on_cpu(void *info) { struct msr_regs_info *rv = info; rv->err = rdmsr_safe_regs(rv->regs); } static void __wrmsr_safe_regs_on_cpu(void *info) { struct msr_regs_info *rv = info; rv->err = wrmsr_safe_regs(rv->regs); } int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { int err; struct msr_regs_info rv; rv.regs = regs; rv.err = -EIO; err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1); return err ? err : rv.err; } EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu); int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { int err; struct msr_regs_info rv; rv.regs = regs; rv.err = -EIO; err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1); return err ? err : rv.err; } EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu);
9 1440 1439 1436 410 17 17 17 17 8 3 6 16 1 8 1 8 97 4 6 4 6 2 2 91 88 1 37 27 7 1 1 1 1 1 17 2 10 4 11 4 9 5 23 8 16 11 3 6 1 89 2 1 2 88 93 94 89 33 19 37 89 95 2 94 7 7 2 5 1 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 /* * This file implement the Wireless Extensions core API. * * Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com> * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved. * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * * (As all part of the Linux kernel, this file is GPL) */ #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/wireless.h> #include <linux/uaccess.h> #include <linux/export.h> #include <net/cfg80211.h> #include <net/iw_handler.h> #include <net/netlink.h> #include <net/wext.h> #include <net/net_namespace.h> typedef int (*wext_ioctl_func)(struct net_device *, struct iwreq *, unsigned int, struct iw_request_info *, iw_handler); /* * Meta-data about all the standard Wireless Extension request we * know about. */ static const struct iw_ioctl_description standard_ioctl[] = { [IW_IOCTL_IDX(SIOCSIWCOMMIT)] = { .header_type = IW_HEADER_TYPE_NULL, }, [IW_IOCTL_IDX(SIOCGIWNAME)] = { .header_type = IW_HEADER_TYPE_CHAR, .flags = IW_DESCR_FLAG_DUMP, }, [IW_IOCTL_IDX(SIOCSIWNWID)] = { .header_type = IW_HEADER_TYPE_PARAM, .flags = IW_DESCR_FLAG_EVENT, }, [IW_IOCTL_IDX(SIOCGIWNWID)] = { .header_type = IW_HEADER_TYPE_PARAM, .flags = IW_DESCR_FLAG_DUMP, }, [IW_IOCTL_IDX(SIOCSIWFREQ)] = { .header_type = IW_HEADER_TYPE_FREQ, .flags = IW_DESCR_FLAG_EVENT, }, [IW_IOCTL_IDX(SIOCGIWFREQ)] = { .header_type = IW_HEADER_TYPE_FREQ, .flags = IW_DESCR_FLAG_DUMP, }, [IW_IOCTL_IDX(SIOCSIWMODE)] = { .header_type = IW_HEADER_TYPE_UINT, .flags = IW_DESCR_FLAG_EVENT, }, [IW_IOCTL_IDX(SIOCGIWMODE)] = { .header_type = IW_HEADER_TYPE_UINT, .flags = IW_DESCR_FLAG_DUMP, }, [IW_IOCTL_IDX(SIOCSIWSENS)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCGIWSENS)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCSIWRANGE)] = { .header_type = IW_HEADER_TYPE_NULL, }, [IW_IOCTL_IDX(SIOCGIWRANGE)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = sizeof(struct iw_range), .flags = IW_DESCR_FLAG_DUMP, }, [IW_IOCTL_IDX(SIOCSIWPRIV)] = { .header_type = IW_HEADER_TYPE_NULL, }, [IW_IOCTL_IDX(SIOCGIWPRIV)] = { /* (handled directly by us) */ .header_type = IW_HEADER_TYPE_POINT, .token_size = sizeof(struct iw_priv_args), .max_tokens = 16, .flags = IW_DESCR_FLAG_NOMAX, }, [IW_IOCTL_IDX(SIOCSIWSTATS)] = { .header_type = IW_HEADER_TYPE_NULL, }, [IW_IOCTL_IDX(SIOCGIWSTATS)] = { /* (handled directly by us) */ .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = sizeof(struct iw_statistics), .flags = IW_DESCR_FLAG_DUMP, }, [IW_IOCTL_IDX(SIOCSIWSPY)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = sizeof(struct sockaddr), .max_tokens = IW_MAX_SPY, }, [IW_IOCTL_IDX(SIOCGIWSPY)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = sizeof(struct sockaddr) + sizeof(struct iw_quality), .max_tokens = IW_MAX_SPY, }, [IW_IOCTL_IDX(SIOCSIWTHRSPY)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = sizeof(struct iw_thrspy), .min_tokens = 1, .max_tokens = 1, }, [IW_IOCTL_IDX(SIOCGIWTHRSPY)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = sizeof(struct iw_thrspy), .min_tokens = 1, .max_tokens = 1, }, [IW_IOCTL_IDX(SIOCSIWAP)] = { .header_type = IW_HEADER_TYPE_ADDR, }, [IW_IOCTL_IDX(SIOCGIWAP)] = { .header_type = IW_HEADER_TYPE_ADDR, .flags = IW_DESCR_FLAG_DUMP, }, [IW_IOCTL_IDX(SIOCSIWMLME)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .min_tokens = sizeof(struct iw_mlme), .max_tokens = sizeof(struct iw_mlme), }, [IW_IOCTL_IDX(SIOCGIWAPLIST)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = sizeof(struct sockaddr) + sizeof(struct iw_quality), .max_tokens = IW_MAX_AP, .flags = IW_DESCR_FLAG_NOMAX, }, [IW_IOCTL_IDX(SIOCSIWSCAN)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .min_tokens = 0, .max_tokens = sizeof(struct iw_scan_req), }, [IW_IOCTL_IDX(SIOCGIWSCAN)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_SCAN_MAX_DATA, .flags = IW_DESCR_FLAG_NOMAX, }, [IW_IOCTL_IDX(SIOCSIWESSID)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_ESSID_MAX_SIZE, .flags = IW_DESCR_FLAG_EVENT, }, [IW_IOCTL_IDX(SIOCGIWESSID)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_ESSID_MAX_SIZE, .flags = IW_DESCR_FLAG_DUMP, }, [IW_IOCTL_IDX(SIOCSIWNICKN)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_ESSID_MAX_SIZE, }, [IW_IOCTL_IDX(SIOCGIWNICKN)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_ESSID_MAX_SIZE, }, [IW_IOCTL_IDX(SIOCSIWRATE)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCGIWRATE)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCSIWRTS)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCGIWRTS)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCSIWFRAG)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCGIWFRAG)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCSIWTXPOW)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCGIWTXPOW)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCSIWRETRY)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCGIWRETRY)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCSIWENCODE)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_ENCODING_TOKEN_MAX, .flags = IW_DESCR_FLAG_EVENT | IW_DESCR_FLAG_RESTRICT, }, [IW_IOCTL_IDX(SIOCGIWENCODE)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_ENCODING_TOKEN_MAX, .flags = IW_DESCR_FLAG_DUMP | IW_DESCR_FLAG_RESTRICT, }, [IW_IOCTL_IDX(SIOCSIWPOWER)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCGIWPOWER)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCSIWGENIE)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_GENERIC_IE_MAX, }, [IW_IOCTL_IDX(SIOCGIWGENIE)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_GENERIC_IE_MAX, }, [IW_IOCTL_IDX(SIOCSIWAUTH)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCGIWAUTH)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCSIWENCODEEXT)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .min_tokens = sizeof(struct iw_encode_ext), .max_tokens = sizeof(struct iw_encode_ext) + IW_ENCODING_TOKEN_MAX, }, [IW_IOCTL_IDX(SIOCGIWENCODEEXT)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .min_tokens = sizeof(struct iw_encode_ext), .max_tokens = sizeof(struct iw_encode_ext) + IW_ENCODING_TOKEN_MAX, }, [IW_IOCTL_IDX(SIOCSIWPMKSA)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .min_tokens = sizeof(struct iw_pmksa), .max_tokens = sizeof(struct iw_pmksa), }, }; static const unsigned int standard_ioctl_num = ARRAY_SIZE(standard_ioctl); /* * Meta-data about all the additional standard Wireless Extension events * we know about. */ static const struct iw_ioctl_description standard_event[] = { [IW_EVENT_IDX(IWEVTXDROP)] = { .header_type = IW_HEADER_TYPE_ADDR, }, [IW_EVENT_IDX(IWEVQUAL)] = { .header_type = IW_HEADER_TYPE_QUAL, }, [IW_EVENT_IDX(IWEVCUSTOM)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_CUSTOM_MAX, }, [IW_EVENT_IDX(IWEVREGISTERED)] = { .header_type = IW_HEADER_TYPE_ADDR, }, [IW_EVENT_IDX(IWEVEXPIRED)] = { .header_type = IW_HEADER_TYPE_ADDR, }, [IW_EVENT_IDX(IWEVGENIE)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_GENERIC_IE_MAX, }, [IW_EVENT_IDX(IWEVMICHAELMICFAILURE)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = sizeof(struct iw_michaelmicfailure), }, [IW_EVENT_IDX(IWEVASSOCREQIE)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_GENERIC_IE_MAX, }, [IW_EVENT_IDX(IWEVASSOCRESPIE)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_GENERIC_IE_MAX, }, [IW_EVENT_IDX(IWEVPMKIDCAND)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = sizeof(struct iw_pmkid_cand), }, }; static const unsigned int standard_event_num = ARRAY_SIZE(standard_event); /* Size (in bytes) of various events */ static const int event_type_size[] = { IW_EV_LCP_LEN, /* IW_HEADER_TYPE_NULL */ 0, IW_EV_CHAR_LEN, /* IW_HEADER_TYPE_CHAR */ 0, IW_EV_UINT_LEN, /* IW_HEADER_TYPE_UINT */ IW_EV_FREQ_LEN, /* IW_HEADER_TYPE_FREQ */ IW_EV_ADDR_LEN, /* IW_HEADER_TYPE_ADDR */ 0, IW_EV_POINT_LEN, /* Without variable payload */ IW_EV_PARAM_LEN, /* IW_HEADER_TYPE_PARAM */ IW_EV_QUAL_LEN, /* IW_HEADER_TYPE_QUAL */ }; #ifdef CONFIG_COMPAT static const int compat_event_type_size[] = { IW_EV_COMPAT_LCP_LEN, /* IW_HEADER_TYPE_NULL */ 0, IW_EV_COMPAT_CHAR_LEN, /* IW_HEADER_TYPE_CHAR */ 0, IW_EV_COMPAT_UINT_LEN, /* IW_HEADER_TYPE_UINT */ IW_EV_COMPAT_FREQ_LEN, /* IW_HEADER_TYPE_FREQ */ IW_EV_COMPAT_ADDR_LEN, /* IW_HEADER_TYPE_ADDR */ 0, IW_EV_COMPAT_POINT_LEN, /* Without variable payload */ IW_EV_COMPAT_PARAM_LEN, /* IW_HEADER_TYPE_PARAM */ IW_EV_COMPAT_QUAL_LEN, /* IW_HEADER_TYPE_QUAL */ }; #endif /* IW event code */ void wireless_nlevent_flush(void) { struct sk_buff *skb; struct net *net; down_read(&net_rwsem); for_each_net(net) { while ((skb = skb_dequeue(&net->wext_nlevents))) rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); } up_read(&net_rwsem); } EXPORT_SYMBOL_GPL(wireless_nlevent_flush); static int wext_netdev_notifier_call(struct notifier_block *nb, unsigned long state, void *ptr) { /* * When a netdev changes state in any way, flush all pending messages * to avoid them going out in a strange order, e.g. RTM_NEWLINK after * RTM_DELLINK, or with IFF_UP after without IFF_UP during dev_close() * or similar - all of which could otherwise happen due to delays from * schedule_work(). */ wireless_nlevent_flush(); return NOTIFY_OK; } static struct notifier_block wext_netdev_notifier = { .notifier_call = wext_netdev_notifier_call, }; static int __net_init wext_pernet_init(struct net *net) { skb_queue_head_init(&net->wext_nlevents); return 0; } static void __net_exit wext_pernet_exit(struct net *net) { skb_queue_purge(&net->wext_nlevents); } static struct pernet_operations wext_pernet_ops = { .init = wext_pernet_init, .exit = wext_pernet_exit, }; static int __init wireless_nlevent_init(void) { int err = register_pernet_subsys(&wext_pernet_ops); if (err) return err; err = register_netdevice_notifier(&wext_netdev_notifier); if (err) unregister_pernet_subsys(&wext_pernet_ops); return err; } subsys_initcall(wireless_nlevent_init); /* Process events generated by the wireless layer or the driver. */ static void wireless_nlevent_process(struct work_struct *work) { wireless_nlevent_flush(); } static DECLARE_WORK(wireless_nlevent_work, wireless_nlevent_process); static struct nlmsghdr *rtnetlink_ifinfo_prep(struct net_device *dev, struct sk_buff *skb) { struct ifinfomsg *r; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, 0, 0, RTM_NEWLINK, sizeof(*r), 0); if (!nlh) return NULL; r = nlmsg_data(nlh); r->ifi_family = AF_UNSPEC; r->__ifi_pad = 0; r->ifi_type = dev->type; r->ifi_index = dev->ifindex; r->ifi_flags = dev_get_flags(dev); r->ifi_change = 0; /* Wireless changes don't affect those flags */ if (nla_put_string(skb, IFLA_IFNAME, dev->name)) goto nla_put_failure; return nlh; nla_put_failure: nlmsg_cancel(skb, nlh); return NULL; } /* * Main event dispatcher. Called from other parts and drivers. * Send the event on the appropriate channels. * May be called from interrupt context. */ void wireless_send_event(struct net_device * dev, unsigned int cmd, union iwreq_data * wrqu, const char * extra) { const struct iw_ioctl_description * descr = NULL; int extra_len = 0; struct iw_event *event; /* Mallocated whole event */ int event_len; /* Its size */ int hdr_len; /* Size of the event header */ int wrqu_off = 0; /* Offset in wrqu */ /* Don't "optimise" the following variable, it will crash */ unsigned int cmd_index; /* *MUST* be unsigned */ struct sk_buff *skb; struct nlmsghdr *nlh; struct nlattr *nla; #ifdef CONFIG_COMPAT struct __compat_iw_event *compat_event; struct compat_iw_point compat_wrqu; struct sk_buff *compskb; int ptr_len; #endif /* * Nothing in the kernel sends scan events with data, be safe. * This is necessary because we cannot fix up scan event data * for compat, due to being contained in 'extra', but normally * applications are required to retrieve the scan data anyway * and no data is included in the event, this codifies that * practice. */ if (WARN_ON(cmd == SIOCGIWSCAN && extra)) extra = NULL; /* Get the description of the Event */ if (cmd <= SIOCIWLAST) { cmd_index = IW_IOCTL_IDX(cmd); if (cmd_index < standard_ioctl_num) descr = &(standard_ioctl[cmd_index]); } else { cmd_index = IW_EVENT_IDX(cmd); if (cmd_index < standard_event_num) descr = &(standard_event[cmd_index]); } /* Don't accept unknown events */ if (descr == NULL) { /* Note : we don't return an error to the driver, because * the driver would not know what to do about it. It can't * return an error to the user, because the event is not * initiated by a user request. * The best the driver could do is to log an error message. * We will do it ourselves instead... */ netdev_err(dev, "(WE) : Invalid/Unknown Wireless Event (0x%04X)\n", cmd); return; } /* Check extra parameters and set extra_len */ if (descr->header_type == IW_HEADER_TYPE_POINT) { /* Check if number of token fits within bounds */ if (wrqu->data.length > descr->max_tokens) { netdev_err(dev, "(WE) : Wireless Event (cmd=0x%04X) too big (%d)\n", cmd, wrqu->data.length); return; } if (wrqu->data.length < descr->min_tokens) { netdev_err(dev, "(WE) : Wireless Event (cmd=0x%04X) too small (%d)\n", cmd, wrqu->data.length); return; } /* Calculate extra_len - extra is NULL for restricted events */ if (extra != NULL) extra_len = wrqu->data.length * descr->token_size; /* Always at an offset in wrqu */ wrqu_off = IW_EV_POINT_OFF; } /* Total length of the event */ hdr_len = event_type_size[descr->header_type]; event_len = hdr_len + extra_len; /* * The problem for 64/32 bit. * * On 64-bit, a regular event is laid out as follows: * | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | * | event.len | event.cmd | p a d d i n g | * | wrqu data ... (with the correct size) | * * This padding exists because we manipulate event->u, * and 'event' is not packed. * * An iw_point event is laid out like this instead: * | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | * | event.len | event.cmd | p a d d i n g | * | iwpnt.len | iwpnt.flg | p a d d i n g | * | extra data ... * * The second padding exists because struct iw_point is extended, * but this depends on the platform... * * On 32-bit, all the padding shouldn't be there. */ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return; /* Send via the RtNetlink event channel */ nlh = rtnetlink_ifinfo_prep(dev, skb); if (WARN_ON(!nlh)) { kfree_skb(skb); return; } /* Add the wireless events in the netlink packet */ nla = nla_reserve(skb, IFLA_WIRELESS, event_len); if (!nla) { kfree_skb(skb); return; } event = nla_data(nla); /* Fill event - first clear to avoid data leaking */ memset(event, 0, hdr_len); event->len = event_len; event->cmd = cmd; memcpy(&event->u, ((char *) wrqu) + wrqu_off, hdr_len - IW_EV_LCP_LEN); if (extra_len) memcpy(((char *) event) + hdr_len, extra, extra_len); nlmsg_end(skb, nlh); #ifdef CONFIG_COMPAT hdr_len = compat_event_type_size[descr->header_type]; /* ptr_len is remaining size in event header apart from LCP */ ptr_len = hdr_len - IW_EV_COMPAT_LCP_LEN; event_len = hdr_len + extra_len; compskb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!compskb) { kfree_skb(skb); return; } /* Send via the RtNetlink event channel */ nlh = rtnetlink_ifinfo_prep(dev, compskb); if (WARN_ON(!nlh)) { kfree_skb(skb); kfree_skb(compskb); return; } /* Add the wireless events in the netlink packet */ nla = nla_reserve(compskb, IFLA_WIRELESS, event_len); if (!nla) { kfree_skb(skb); kfree_skb(compskb); return; } compat_event = nla_data(nla); compat_event->len = event_len; compat_event->cmd = cmd; if (descr->header_type == IW_HEADER_TYPE_POINT) { compat_wrqu.length = wrqu->data.length; compat_wrqu.flags = wrqu->data.flags; memcpy(compat_event->ptr_bytes, ((char *)&compat_wrqu) + IW_EV_COMPAT_POINT_OFF, ptr_len); if (extra_len) memcpy(&compat_event->ptr_bytes[ptr_len], extra, extra_len); } else { /* extra_len must be zero, so no if (extra) needed */ memcpy(compat_event->ptr_bytes, wrqu, ptr_len); } nlmsg_end(compskb, nlh); skb_shinfo(skb)->frag_list = compskb; #endif skb_queue_tail(&dev_net(dev)->wext_nlevents, skb); schedule_work(&wireless_nlevent_work); } EXPORT_SYMBOL(wireless_send_event); #ifdef CONFIG_CFG80211_WEXT static void wireless_warn_cfg80211_wext(void) { char name[sizeof(current->comm)]; pr_warn_once("warning: `%s' uses wireless extensions which will stop working for Wi-Fi 7 hardware; use nl80211\n", get_task_comm(name, current)); } #endif /* IW handlers */ struct iw_statistics *get_wireless_stats(struct net_device *dev) { #ifdef CONFIG_WIRELESS_EXT if ((dev->wireless_handlers != NULL) && (dev->wireless_handlers->get_wireless_stats != NULL)) return dev->wireless_handlers->get_wireless_stats(dev); #endif #ifdef CONFIG_CFG80211_WEXT if (dev->ieee80211_ptr && dev->ieee80211_ptr->wiphy && dev->ieee80211_ptr->wiphy->wext && dev->ieee80211_ptr->wiphy->wext->get_wireless_stats) { wireless_warn_cfg80211_wext(); if (dev->ieee80211_ptr->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO) return NULL; return dev->ieee80211_ptr->wiphy->wext->get_wireless_stats(dev); } #endif /* not found */ return NULL; } /* noinline to avoid a bogus warning with -O3 */ static noinline int iw_handler_get_iwstats(struct net_device * dev, struct iw_request_info * info, union iwreq_data * wrqu, char * extra) { /* Get stats from the driver */ struct iw_statistics *stats; stats = get_wireless_stats(dev); if (stats) { /* Copy statistics to extra */ memcpy(extra, stats, sizeof(struct iw_statistics)); wrqu->data.length = sizeof(struct iw_statistics); /* Check if we need to clear the updated flag */ if (wrqu->data.flags != 0) stats->qual.updated &= ~IW_QUAL_ALL_UPDATED; return 0; } else return -EOPNOTSUPP; } static iw_handler get_handler(struct net_device *dev, unsigned int cmd) { /* Don't "optimise" the following variable, it will crash */ unsigned int index; /* *MUST* be unsigned */ const struct iw_handler_def *handlers = NULL; #ifdef CONFIG_CFG80211_WEXT if (dev->ieee80211_ptr && dev->ieee80211_ptr->wiphy) { wireless_warn_cfg80211_wext(); if (dev->ieee80211_ptr->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO) return NULL; handlers = dev->ieee80211_ptr->wiphy->wext; } #endif #ifdef CONFIG_WIRELESS_EXT if (dev->wireless_handlers) handlers = dev->wireless_handlers; #endif if (!handlers) return NULL; /* Try as a standard command */ index = IW_IOCTL_IDX(cmd); if (index < handlers->num_standard) return handlers->standard[index]; #ifdef CONFIG_WEXT_PRIV /* Try as a private command */ index = cmd - SIOCIWFIRSTPRIV; if (index < handlers->num_private) return handlers->private[index]; #endif /* Not found */ return NULL; } static int ioctl_standard_iw_point(struct iw_point *iwp, unsigned int cmd, const struct iw_ioctl_description *descr, iw_handler handler, struct net_device *dev, struct iw_request_info *info) { int err, extra_size, user_length = 0, essid_compat = 0; char *extra; /* Calculate space needed by arguments. Always allocate * for max space. */ extra_size = descr->max_tokens * descr->token_size; /* Check need for ESSID compatibility for WE < 21 */ switch (cmd) { case SIOCSIWESSID: case SIOCGIWESSID: case SIOCSIWNICKN: case SIOCGIWNICKN: if (iwp->length == descr->max_tokens + 1) essid_compat = 1; else if (IW_IS_SET(cmd) && (iwp->length != 0)) { char essid[IW_ESSID_MAX_SIZE + 1]; unsigned int len; len = iwp->length * descr->token_size; if (len > IW_ESSID_MAX_SIZE) return -EFAULT; err = copy_from_user(essid, iwp->pointer, len); if (err) return -EFAULT; if (essid[iwp->length - 1] == '\0') essid_compat = 1; } break; default: break; } iwp->length -= essid_compat; /* Check what user space is giving us */ if (IW_IS_SET(cmd)) { /* Check NULL pointer */ if (!iwp->pointer && iwp->length != 0) return -EFAULT; /* Check if number of token fits within bounds */ if (iwp->length > descr->max_tokens) return -E2BIG; if (iwp->length < descr->min_tokens) return -EINVAL; } else { /* Check NULL pointer */ if (!iwp->pointer) return -EFAULT; /* Save user space buffer size for checking */ user_length = iwp->length; /* Don't check if user_length > max to allow forward * compatibility. The test user_length < min is * implied by the test at the end. */ /* Support for very large requests */ if ((descr->flags & IW_DESCR_FLAG_NOMAX) && (user_length > descr->max_tokens)) { /* Allow userspace to GET more than max so * we can support any size GET requests. * There is still a limit : -ENOMEM. */ extra_size = user_length * descr->token_size; /* Note : user_length is originally a __u16, * and token_size is controlled by us, * so extra_size won't get negative and * won't overflow... */ } } /* Sanity-check to ensure we never end up _allocating_ zero * bytes of data for extra. */ if (extra_size <= 0) return -EFAULT; /* kzalloc() ensures NULL-termination for essid_compat. */ extra = kzalloc(extra_size, GFP_KERNEL); if (!extra) return -ENOMEM; /* If it is a SET, get all the extra data in here */ if (IW_IS_SET(cmd) && (iwp->length != 0)) { if (copy_from_user(extra, iwp->pointer, iwp->length * descr->token_size)) { err = -EFAULT; goto out; } if (cmd == SIOCSIWENCODEEXT) { struct iw_encode_ext *ee = (void *) extra; if (iwp->length < sizeof(*ee) + ee->key_len) { err = -EFAULT; goto out; } } } if (IW_IS_GET(cmd) && !(descr->flags & IW_DESCR_FLAG_NOMAX)) { /* * If this is a GET, but not NOMAX, it means that the extra * data is not bounded by userspace, but by max_tokens. Thus * set the length to max_tokens. This matches the extra data * allocation. * The driver should fill it with the number of tokens it * provided, and it may check iwp->length rather than having * knowledge of max_tokens. If the driver doesn't change the * iwp->length, this ioctl just copies back max_token tokens * filled with zeroes. Hopefully the driver isn't claiming * them to be valid data. */ iwp->length = descr->max_tokens; } err = handler(dev, info, (union iwreq_data *) iwp, extra); iwp->length += essid_compat; /* If we have something to return to the user */ if (!err && IW_IS_GET(cmd)) { /* Check if there is enough buffer up there */ if (user_length < iwp->length) { err = -E2BIG; goto out; } if (copy_to_user(iwp->pointer, extra, iwp->length * descr->token_size)) { err = -EFAULT; goto out; } } /* Generate an event to notify listeners of the change */ if ((descr->flags & IW_DESCR_FLAG_EVENT) && ((err == 0) || (err == -EIWCOMMIT))) { union iwreq_data *data = (union iwreq_data *) iwp; if (descr->flags & IW_DESCR_FLAG_RESTRICT) /* If the event is restricted, don't * export the payload. */ wireless_send_event(dev, cmd, data, NULL); else wireless_send_event(dev, cmd, data, extra); } out: kfree(extra); return err; } /* * Call the commit handler in the driver * (if exist and if conditions are right) * * Note : our current commit strategy is currently pretty dumb, * but we will be able to improve on that... * The goal is to try to agreagate as many changes as possible * before doing the commit. Drivers that will define a commit handler * are usually those that need a reset after changing parameters, so * we want to minimise the number of reset. * A cool idea is to use a timer : at each "set" command, we re-set the * timer, when the timer eventually fires, we call the driver. * Hopefully, more on that later. * * Also, I'm waiting to see how many people will complain about the * netif_running(dev) test. I'm open on that one... * Hopefully, the driver will remember to do a commit in "open()" ;-) */ int call_commit_handler(struct net_device *dev) { #ifdef CONFIG_WIRELESS_EXT if (netif_running(dev) && dev->wireless_handlers && dev->wireless_handlers->standard[0]) /* Call the commit handler on the driver */ return dev->wireless_handlers->standard[0](dev, NULL, NULL, NULL); else return 0; /* Command completed successfully */ #else /* cfg80211 has no commit */ return 0; #endif } /* * Main IOCTl dispatcher. * Check the type of IOCTL and call the appropriate wrapper... */ static int wireless_process_ioctl(struct net *net, struct iwreq *iwr, unsigned int cmd, struct iw_request_info *info, wext_ioctl_func standard, wext_ioctl_func private) { struct net_device *dev; iw_handler handler; /* Permissions are already checked in dev_ioctl() before calling us. * The copy_to/from_user() of ifr is also dealt with in there */ /* Make sure the device exist */ if ((dev = __dev_get_by_name(net, iwr->ifr_name)) == NULL) return -ENODEV; /* A bunch of special cases, then the generic case... * Note that 'cmd' is already filtered in dev_ioctl() with * (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) */ if (cmd == SIOCGIWSTATS) return standard(dev, iwr, cmd, info, &iw_handler_get_iwstats); #ifdef CONFIG_WEXT_PRIV if (cmd == SIOCGIWPRIV && dev->wireless_handlers) return standard(dev, iwr, cmd, info, iw_handler_get_private); #endif /* Basic check */ if (!netif_device_present(dev)) return -ENODEV; /* New driver API : try to find the handler */ handler = get_handler(dev, cmd); if (handler) { /* Standard and private are not the same */ if (cmd < SIOCIWFIRSTPRIV) return standard(dev, iwr, cmd, info, handler); else if (private) return private(dev, iwr, cmd, info, handler); } return -EOPNOTSUPP; } /* If command is `set a parameter', or `get the encoding parameters', * check if the user has the right to do it. */ static int wext_permission_check(unsigned int cmd) { if ((IW_IS_SET(cmd) || cmd == SIOCGIWENCODE || cmd == SIOCGIWENCODEEXT) && !capable(CAP_NET_ADMIN)) return -EPERM; return 0; } /* entry point from dev ioctl */ static int wext_ioctl_dispatch(struct net *net, struct iwreq *iwr, unsigned int cmd, struct iw_request_info *info, wext_ioctl_func standard, wext_ioctl_func private) { int ret = wext_permission_check(cmd); if (ret) return ret; dev_load(net, iwr->ifr_name); rtnl_lock(); ret = wireless_process_ioctl(net, iwr, cmd, info, standard, private); rtnl_unlock(); return ret; } /* * Wrapper to call a standard Wireless Extension handler. * We do various checks and also take care of moving data between * user space and kernel space. */ static int ioctl_standard_call(struct net_device * dev, struct iwreq *iwr, unsigned int cmd, struct iw_request_info *info, iw_handler handler) { const struct iw_ioctl_description * descr; int ret = -EINVAL; /* Get the description of the IOCTL */ if (IW_IOCTL_IDX(cmd) >= standard_ioctl_num) return -EOPNOTSUPP; descr = &(standard_ioctl[IW_IOCTL_IDX(cmd)]); /* Check if we have a pointer to user space data or not */ if (descr->header_type != IW_HEADER_TYPE_POINT) { /* No extra arguments. Trivial to handle */ ret = handler(dev, info, &(iwr->u), NULL); /* Generate an event to notify listeners of the change */ if ((descr->flags & IW_DESCR_FLAG_EVENT) && ((ret == 0) || (ret == -EIWCOMMIT))) wireless_send_event(dev, cmd, &(iwr->u), NULL); } else { ret = ioctl_standard_iw_point(&iwr->u.data, cmd, descr, handler, dev, info); } /* Call commit handler if needed and defined */ if (ret == -EIWCOMMIT) ret = call_commit_handler(dev); /* Here, we will generate the appropriate event if needed */ return ret; } int wext_handle_ioctl(struct net *net, unsigned int cmd, void __user *arg) { struct iw_request_info info = { .cmd = cmd, .flags = 0 }; struct iwreq iwr; int ret; if (copy_from_user(&iwr, arg, sizeof(iwr))) return -EFAULT; iwr.ifr_name[sizeof(iwr.ifr_name) - 1] = 0; ret = wext_ioctl_dispatch(net, &iwr, cmd, &info, ioctl_standard_call, ioctl_private_call); if (ret >= 0 && IW_IS_GET(cmd) && copy_to_user(arg, &iwr, sizeof(struct iwreq))) return -EFAULT; return ret; } #ifdef CONFIG_COMPAT static int compat_standard_call(struct net_device *dev, struct iwreq *iwr, unsigned int cmd, struct iw_request_info *info, iw_handler handler) { const struct iw_ioctl_description *descr; struct compat_iw_point *iwp_compat; struct iw_point iwp; int err; descr = standard_ioctl + IW_IOCTL_IDX(cmd); if (descr->header_type != IW_HEADER_TYPE_POINT) return ioctl_standard_call(dev, iwr, cmd, info, handler); iwp_compat = (struct compat_iw_point *) &iwr->u.data; iwp.pointer = compat_ptr(iwp_compat->pointer); iwp.length = iwp_compat->length; iwp.flags = iwp_compat->flags; err = ioctl_standard_iw_point(&iwp, cmd, descr, handler, dev, info); iwp_compat->pointer = ptr_to_compat(iwp.pointer); iwp_compat->length = iwp.length; iwp_compat->flags = iwp.flags; return err; } int compat_wext_handle_ioctl(struct net *net, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct iw_request_info info; struct iwreq iwr; char *colon; int ret; if (copy_from_user(&iwr, argp, sizeof(struct iwreq))) return -EFAULT; iwr.ifr_name[IFNAMSIZ-1] = 0; colon = strchr(iwr.ifr_name, ':'); if (colon) *colon = 0; info.cmd = cmd; info.flags = IW_REQUEST_FLAG_COMPAT; ret = wext_ioctl_dispatch(net, &iwr, cmd, &info, compat_standard_call, compat_private_call); if (ret >= 0 && IW_IS_GET(cmd) && copy_to_user(argp, &iwr, sizeof(struct iwreq))) return -EFAULT; return ret; } #endif char *iwe_stream_add_event(struct iw_request_info *info, char *stream, char *ends, struct iw_event *iwe, int event_len) { int lcp_len = iwe_stream_lcp_len(info); event_len = iwe_stream_event_len_adjust(info, event_len); /* Check if it's possible */ if (likely((stream + event_len) < ends)) { iwe->len = event_len; /* Beware of alignement issues on 64 bits */ memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN); memcpy(stream + lcp_len, &iwe->u, event_len - lcp_len); stream += event_len; } return stream; } EXPORT_SYMBOL(iwe_stream_add_event); char *iwe_stream_add_point(struct iw_request_info *info, char *stream, char *ends, struct iw_event *iwe, char *extra) { int event_len = iwe_stream_point_len(info) + iwe->u.data.length; int point_len = iwe_stream_point_len(info); int lcp_len = iwe_stream_lcp_len(info); /* Check if it's possible */ if (likely((stream + event_len) < ends)) { iwe->len = event_len; memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN); memcpy(stream + lcp_len, ((char *) &iwe->u) + IW_EV_POINT_OFF, IW_EV_POINT_PK_LEN - IW_EV_LCP_PK_LEN); if (iwe->u.data.length && extra) memcpy(stream + point_len, extra, iwe->u.data.length); stream += event_len; } return stream; } EXPORT_SYMBOL(iwe_stream_add_point); char *iwe_stream_add_value(struct iw_request_info *info, char *event, char *value, char *ends, struct iw_event *iwe, int event_len) { int lcp_len = iwe_stream_lcp_len(info); /* Don't duplicate LCP */ event_len -= IW_EV_LCP_LEN; /* Check if it's possible */ if (likely((value + event_len) < ends)) { /* Add new value */ memcpy(value, &iwe->u, event_len); value += event_len; /* Patch LCP */ iwe->len = value - event; memcpy(event, (char *) iwe, lcp_len); } return value; } EXPORT_SYMBOL(iwe_stream_add_value);
3 12 37 102 48 2647 2649 13 103 2641 106 6 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 // SPDX-License-Identifier: GPL-2.0 /* * Workingset detection * * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner */ #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/writeback.h> #include <linux/shmem_fs.h> #include <linux/pagemap.h> #include <linux/atomic.h> #include <linux/module.h> #include <linux/swap.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/mm.h> /* * Double CLOCK lists * * Per node, two clock lists are maintained for file pages: the * inactive and the active list. Freshly faulted pages start out at * the head of the inactive list and page reclaim scans pages from the * tail. Pages that are accessed multiple times on the inactive list * are promoted to the active list, to protect them from reclaim, * whereas active pages are demoted to the inactive list when the * active list grows too big. * * fault ------------------------+ * | * +--------------+ | +-------------+ * reclaim <- | inactive | <-+-- demotion | active | <--+ * +--------------+ +-------------+ | * | | * +-------------- promotion ------------------+ * * * Access frequency and refault distance * * A workload is thrashing when its pages are frequently used but they * are evicted from the inactive list every time before another access * would have promoted them to the active list. * * In cases where the average access distance between thrashing pages * is bigger than the size of memory there is nothing that can be * done - the thrashing set could never fit into memory under any * circumstance. * * However, the average access distance could be bigger than the * inactive list, yet smaller than the size of memory. In this case, * the set could fit into memory if it weren't for the currently * active pages - which may be used more, hopefully less frequently: * * +-memory available to cache-+ * | | * +-inactive------+-active----+ * a b | c d e f g h i | J K L M N | * +---------------+-----------+ * * It is prohibitively expensive to accurately track access frequency * of pages. But a reasonable approximation can be made to measure * thrashing on the inactive list, after which refaulting pages can be * activated optimistically to compete with the existing active pages. * * Approximating inactive page access frequency - Observations: * * 1. When a page is accessed for the first time, it is added to the * head of the inactive list, slides every existing inactive page * towards the tail by one slot, and pushes the current tail page * out of memory. * * 2. When a page is accessed for the second time, it is promoted to * the active list, shrinking the inactive list by one slot. This * also slides all inactive pages that were faulted into the cache * more recently than the activated page towards the tail of the * inactive list. * * Thus: * * 1. The sum of evictions and activations between any two points in * time indicate the minimum number of inactive pages accessed in * between. * * 2. Moving one inactive page N page slots towards the tail of the * list requires at least N inactive page accesses. * * Combining these: * * 1. When a page is finally evicted from memory, the number of * inactive pages accessed while the page was in cache is at least * the number of page slots on the inactive list. * * 2. In addition, measuring the sum of evictions and activations (E) * at the time of a page's eviction, and comparing it to another * reading (R) at the time the page faults back into memory tells * the minimum number of accesses while the page was not cached. * This is called the refault distance. * * Because the first access of the page was the fault and the second * access the refault, we combine the in-cache distance with the * out-of-cache distance to get the complete minimum access distance * of this page: * * NR_inactive + (R - E) * * And knowing the minimum access distance of a page, we can easily * tell if the page would be able to stay in cache assuming all page * slots in the cache were available: * * NR_inactive + (R - E) <= NR_inactive + NR_active * * If we have swap we should consider about NR_inactive_anon and * NR_active_anon, so for page cache and anonymous respectively: * * NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file * + NR_inactive_anon + NR_active_anon * * NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon * + NR_inactive_file + NR_active_file * * Which can be further simplified to: * * (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon * * (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file * * Put into words, the refault distance (out-of-cache) can be seen as * a deficit in inactive list space (in-cache). If the inactive list * had (R - E) more page slots, the page would not have been evicted * in between accesses, but activated instead. And on a full system, * the only thing eating into inactive list space is active pages. * * * Refaulting inactive pages * * All that is known about the active list is that the pages have been * accessed more than once in the past. This means that at any given * time there is actually a good chance that pages on the active list * are no longer in active use. * * So when a refault distance of (R - E) is observed and there are at * least (R - E) pages in the userspace workingset, the refaulting page * is activated optimistically in the hope that (R - E) pages are actually * used less frequently than the refaulting page - or even not used at * all anymore. * * That means if inactive cache is refaulting with a suitable refault * distance, we assume the cache workingset is transitioning and put * pressure on the current workingset. * * If this is wrong and demotion kicks in, the pages which are truly * used more frequently will be reactivated while the less frequently * used once will be evicted from memory. * * But if this is right, the stale pages will be pushed out of memory * and the used pages get to stay in cache. * * Refaulting active pages * * If on the other hand the refaulting pages have recently been * deactivated, it means that the active list is no longer protecting * actively used cache from reclaim. The cache is NOT transitioning to * a different workingset; the existing workingset is thrashing in the * space allocated to the page cache. * * * Implementation * * For each node's LRU lists, a counter for inactive evictions and * activations is maintained (node->nonresident_age). * * On eviction, a snapshot of this counter (along with some bits to * identify the node) is stored in the now empty page cache * slot of the evicted page. This is called a shadow entry. * * On cache misses for which there are shadow entries, an eligible * refault distance will immediately activate the refaulting page. */ #define WORKINGSET_SHIFT 1 #define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ WORKINGSET_SHIFT + NODES_SHIFT + \ MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) /* * Eviction timestamps need to be able to cover the full range of * actionable refaults. However, bits are tight in the xarray * entry, and after storing the identifier for the lruvec there might * not be enough left to represent every single actionable refault. In * that case, we have to sacrifice granularity for distance, and group * evictions into coarser buckets by shaving off lower timestamp bits. */ static unsigned int bucket_order __read_mostly; static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, bool workingset) { eviction &= EVICTION_MASK; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; eviction = (eviction << WORKINGSET_SHIFT) | workingset; return xa_mk_value(eviction); } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, unsigned long *evictionp, bool *workingsetp) { unsigned long entry = xa_to_value(shadow); int memcgid, nid; bool workingset; workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1); entry >>= WORKINGSET_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); entry >>= MEM_CGROUP_ID_SHIFT; *memcgidp = memcgid; *pgdat = NODE_DATA(nid); *evictionp = entry; *workingsetp = workingset; } #ifdef CONFIG_LRU_GEN static void *lru_gen_eviction(struct folio *folio) { int hist; unsigned long token; unsigned long min_seq; struct lruvec *lruvec; struct lru_gen_folio *lrugen; int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); int tier = lru_tier_from_refs(refs); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); lruvec = mem_cgroup_lruvec(memcg, pgdat); lrugen = &lruvec->lrugen; min_seq = READ_ONCE(lrugen->min_seq[type]); token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0); hist = lru_hist_from_seq(min_seq); atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs); } /* * Tests if the shadow entry is for a folio that was recently evicted. * Fills in @lruvec, @token, @workingset with the values unpacked from shadow. */ static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, unsigned long *token, bool *workingset) { int memcg_id; unsigned long min_seq; struct mem_cgroup *memcg; struct pglist_data *pgdat; unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset); memcg = mem_cgroup_from_id(memcg_id); *lruvec = mem_cgroup_lruvec(memcg, pgdat); min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]); return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)); } static void lru_gen_refault(struct folio *folio, void *shadow) { bool recent; int hist, tier, refs; bool workingset; unsigned long token; struct lruvec *lruvec; struct lru_gen_folio *lrugen; int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); rcu_read_lock(); recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset); if (lruvec != folio_lruvec(folio)) goto unlock; mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); if (!recent) goto unlock; lrugen = &lruvec->lrugen; hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type])); /* see the comment in folio_lru_refs() */ refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset; tier = lru_tier_from_refs(refs); atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); /* * Count the following two cases as stalls: * 1. For pages accessed through page tables, hotter pages pushed out * hot pages which refaulted immediately. * 2. For pages accessed multiple times through file descriptors, * they would have been protected by sort_folio(). */ if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) { set_mask_bits(&folio->flags, 0, LRU_REFS_MASK | BIT(PG_workingset)); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); } unlock: rcu_read_unlock(); } #else /* !CONFIG_LRU_GEN */ static void *lru_gen_eviction(struct folio *folio) { return NULL; } static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, unsigned long *token, bool *workingset) { return false; } static void lru_gen_refault(struct folio *folio, void *shadow) { } #endif /* CONFIG_LRU_GEN */ /** * workingset_age_nonresident - age non-resident entries as LRU ages * @lruvec: the lruvec that was aged * @nr_pages: the number of pages to count * * As in-memory pages are aged, non-resident pages need to be aged as * well, in order for the refault distances later on to be comparable * to the in-memory dimensions. This function allows reclaim and LRU * operations to drive the non-resident aging along in parallel. */ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) { /* * Reclaiming a cgroup means reclaiming all its children in a * round-robin fashion. That means that each cgroup has an LRU * order that is composed of the LRU orders of its child * cgroups; and every page has an LRU position not just in the * cgroup that owns it, but in all of that group's ancestors. * * So when the physical inactive list of a leaf cgroup ages, * the virtual inactive lists of all its parents, including * the root cgroup's, age as well. */ do { atomic_long_add(nr_pages, &lruvec->nonresident_age); } while ((lruvec = parent_lruvec(lruvec))); } /** * workingset_eviction - note the eviction of a folio from memory * @target_memcg: the cgroup that is causing the reclaim * @folio: the folio being evicted * * Return: a shadow entry to be stored in @folio->mapping->i_pages in place * of the evicted @folio so that a later refault can be detected. */ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) { struct pglist_data *pgdat = folio_pgdat(folio); unsigned long eviction; struct lruvec *lruvec; int memcgid; /* Folio is fully exclusive and pins folio's memory cgroup pointer */ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (lru_gen_enabled()) return lru_gen_eviction(folio); lruvec = mem_cgroup_lruvec(target_memcg, pgdat); /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); eviction >>= bucket_order; workingset_age_nonresident(lruvec, folio_nr_pages(folio)); return pack_shadow(memcgid, pgdat, eviction, folio_test_workingset(folio)); } /** * workingset_test_recent - tests if the shadow entry is for a folio that was * recently evicted. Also fills in @workingset with the value unpacked from * shadow. * @shadow: the shadow entry to be tested. * @file: whether the corresponding folio is from the file lru. * @workingset: where the workingset value unpacked from shadow should * be stored. * * Return: true if the shadow is for a recently evicted folio; false otherwise. */ bool workingset_test_recent(void *shadow, bool file, bool *workingset) { struct mem_cgroup *eviction_memcg; struct lruvec *eviction_lruvec; unsigned long refault_distance; unsigned long workingset_size; unsigned long refault; int memcgid; struct pglist_data *pgdat; unsigned long eviction; if (lru_gen_enabled()) return lru_gen_test_recent(shadow, file, &eviction_lruvec, &eviction, workingset); unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); eviction <<= bucket_order; /* * Look up the memcg associated with the stored ID. It might * have been deleted since the folio's eviction. * * Note that in rare events the ID could have been recycled * for a new cgroup that refaults a shared folio. This is * impossible to tell from the available data. However, this * should be a rare and limited disturbance, and activations * are always speculative anyway. Ultimately, it's the aging * algorithm's job to shake out the minimum access frequency * for the active cache. * * XXX: On !CONFIG_MEMCG, this will always return NULL; it * would be better if the root_mem_cgroup existed in all * configurations instead. */ eviction_memcg = mem_cgroup_from_id(memcgid); if (!mem_cgroup_disabled() && !eviction_memcg) return false; eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->nonresident_age); /* * Calculate the refault distance * * The unsigned subtraction here gives an accurate distance * across nonresident_age overflows in most cases. There is a * special case: usually, shadow entries have a short lifetime * and are either refaulted or reclaimed along with the inode * before they get too old. But it is not impossible for the * nonresident_age to lap a shadow entry in the field, which * can then result in a false small refault distance, leading * to a false activation should this old entry actually * refault again. However, earlier kernels used to deactivate * unconditionally with *every* reclaim invocation for the * longest time, so the occasional inappropriate activation * leading to pressure on the active list is not a problem. */ refault_distance = (refault - eviction) & EVICTION_MASK; /* * Compare the distance to the existing workingset size. We * don't activate pages that couldn't stay resident even if * all the memory was available to the workingset. Whether * workingset competition needs to consider anon or not depends * on having free swap space. */ workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); if (!file) { workingset_size += lruvec_page_state(eviction_lruvec, NR_INACTIVE_FILE); } if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) { workingset_size += lruvec_page_state(eviction_lruvec, NR_ACTIVE_ANON); if (file) { workingset_size += lruvec_page_state(eviction_lruvec, NR_INACTIVE_ANON); } } return refault_distance <= workingset_size; } /** * workingset_refault - Evaluate the refault of a previously evicted folio. * @folio: The freshly allocated replacement folio. * @shadow: Shadow entry of the evicted folio. * * Calculates and evaluates the refault distance of the previously * evicted folio in the context of the node and the memcg whose memory * pressure caused the eviction. */ void workingset_refault(struct folio *folio, void *shadow) { bool file = folio_is_file_lru(folio); struct pglist_data *pgdat; struct mem_cgroup *memcg; struct lruvec *lruvec; bool workingset; long nr; if (lru_gen_enabled()) { lru_gen_refault(folio, shadow); return; } /* Flush stats (and potentially sleep) before holding RCU read lock */ mem_cgroup_flush_stats_ratelimited(); rcu_read_lock(); /* * The activation decision for this folio is made at the level * where the eviction occurred, as that is where the LRU order * during folio reclaim is being determined. * * However, the cgroup that will own the folio is the one that * is actually experiencing the refault event. */ nr = folio_nr_pages(folio); memcg = folio_memcg(folio); pgdat = folio_pgdat(folio); lruvec = mem_cgroup_lruvec(memcg, pgdat); mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); if (!workingset_test_recent(shadow, file, &workingset)) goto out; folio_set_active(folio); workingset_age_nonresident(lruvec, nr); mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr); /* Folio was active prior to eviction */ if (workingset) { folio_set_workingset(folio); /* * XXX: Move to folio_add_lru() when it supports new vs * putback */ lru_note_cost_refault(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); } out: rcu_read_unlock(); } /** * workingset_activation - note a page activation * @folio: Folio that is being activated. */ void workingset_activation(struct folio *folio) { struct mem_cgroup *memcg; rcu_read_lock(); /* * Filter non-memcg pages here, e.g. unmap can call * mark_page_accessed() on VDSO pages. * * XXX: See workingset_refault() - this should return * root_mem_cgroup even for !CONFIG_MEMCG. */ memcg = folio_memcg_rcu(folio); if (!mem_cgroup_disabled() && !memcg) goto out; workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio)); out: rcu_read_unlock(); } /* * Shadow entries reflect the share of the working set that does not * fit into memory, so their number depends on the access pattern of * the workload. In most cases, they will refault or get reclaimed * along with the inode, but a (malicious) workload that streams * through files with a total size several times that of available * memory, while preventing the inodes from being reclaimed, can * create excessive amounts of shadow nodes. To keep a lid on this, * track shadow nodes and reclaim them when they grow way past the * point where they would still be useful. */ struct list_lru shadow_nodes; void workingset_update_node(struct xa_node *node) { struct address_space *mapping; /* * Track non-empty nodes that contain only shadow entries; * unlink those that contain pages or are being freed. * * Avoid acquiring the list_lru lock when the nodes are * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ mapping = container_of(node->array, struct address_space, i_pages); lockdep_assert_held(&mapping->i_pages.xa_lock); if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { list_lru_add(&shadow_nodes, &node->private_list); __inc_lruvec_kmem_state(node, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { list_lru_del(&shadow_nodes, &node->private_list); __dec_lruvec_kmem_state(node, WORKINGSET_NODES); } } } static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { unsigned long max_nodes; unsigned long nodes; unsigned long pages; nodes = list_lru_shrink_count(&shadow_nodes, sc); if (!nodes) return SHRINK_EMPTY; /* * Approximate a reasonable limit for the nodes * containing shadow entries. We don't need to keep more * shadow entries than possible pages on the active list, * since refault distances bigger than that are dismissed. * * The size of the active list converges toward 100% of * overall page cache as memory grows, with only a tiny * inactive list. Assume the total cache size for that. * * Nodes might be sparsely populated, with only one shadow * entry in the extreme case. Obviously, we cannot keep one * node for every eligible shadow entry, so compromise on a * worst-case density of 1/8th. Below that, not all eligible * refaults can be detected anymore. * * On 64-bit with 7 xa_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume * ~1.8% of available memory: * * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE */ #ifdef CONFIG_MEMCG if (sc->memcg) { struct lruvec *lruvec; int i; mem_cgroup_flush_stats(); lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) pages += lruvec_page_state_local(lruvec, NR_LRU_BASE + i); pages += lruvec_page_state_local( lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT; pages += lruvec_page_state_local( lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT; } else #endif pages = node_present_pages(sc->nid); max_nodes = pages >> (XA_CHUNK_SHIFT - 3); if (nodes <= max_nodes) return 0; return nodes - max_nodes; } static enum lru_status shadow_lru_isolate(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) __must_hold(lru_lock) { struct xa_node *node = container_of(item, struct xa_node, private_list); struct address_space *mapping; int ret; /* * Page cache insertions and deletions synchronously maintain * the shadow node LRU under the i_pages lock and the * lru_lock. Because the page cache tree is emptied before * the inode can be destroyed, holding the lru_lock pins any * address_space that has nodes on the LRU. * * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the lru_lock. */ mapping = container_of(node->array, struct address_space, i_pages); /* Coming from the list, invert the lock order */ if (!xa_trylock(&mapping->i_pages)) { spin_unlock_irq(lru_lock); ret = LRU_RETRY; goto out; } /* For page cache we need to hold i_lock */ if (mapping->host != NULL) { if (!spin_trylock(&mapping->host->i_lock)) { xa_unlock(&mapping->i_pages); spin_unlock_irq(lru_lock); ret = LRU_RETRY; goto out; } } list_lru_isolate(lru, item); __dec_lruvec_kmem_state(node, WORKINGSET_NODES); spin_unlock(lru_lock); /* * The nodes should only contain one or more shadow entries, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ if (WARN_ON_ONCE(!node->nr_values)) goto out_invalid; if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; xa_delete_node(node, workingset_update_node); __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); out_invalid: xa_unlock_irq(&mapping->i_pages); if (mapping->host != NULL) { if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); } ret = LRU_REMOVED_RETRY; out: cond_resched(); spin_lock_irq(lru_lock); return ret; } static unsigned long scan_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { /* list_lru lock nests inside the IRQ-safe i_pages lock */ return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate, NULL); } /* * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe * i_pages lock. */ static struct lock_class_key shadow_nodes_key; static int __init workingset_init(void) { struct shrinker *workingset_shadow_shrinker; unsigned int timestamp_bits; unsigned int max_order; int ret = -ENOMEM; BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); /* * Calculate the eviction bucket size to cover the longest * actionable refault distance, which is currently half of * memory (totalram_pages/2). However, memory hotplug may add * some more pages at runtime, so keep working with up to * double the initial memory by using totalram_pages as-is. */ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; max_order = fls_long(totalram_pages() - 1); if (max_order > timestamp_bits) bucket_order = max_order - timestamp_bits; pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-shadow"); if (!workingset_shadow_shrinker) goto err; ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key, workingset_shadow_shrinker); if (ret) goto err_list_lru; workingset_shadow_shrinker->count_objects = count_shadow_nodes; workingset_shadow_shrinker->scan_objects = scan_shadow_nodes; /* ->count reports only fully expendable nodes */ workingset_shadow_shrinker->seeks = 0; shrinker_register(workingset_shadow_shrinker); return 0; err_list_lru: shrinker_free(workingset_shadow_shrinker); err: return ret; } module_init(workingset_init);
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 // SPDX-License-Identifier: GPL-2.0-or-later /* * Roccat Kone[+] driver for Linux * * Copyright (c) 2010 Stefan Achatz <erazor_de@users.sourceforge.net> */ /* */ /* * Roccat Kone[+] is an updated/improved version of the Kone with more memory * and functionality and without the non-standard behaviours the Kone had. * KoneXTD has same capabilities but updated sensor. */ #include <linux/device.h> #include <linux/input.h> #include <linux/hid.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/hid-roccat.h> #include "hid-ids.h" #include "hid-roccat-common.h" #include "hid-roccat-koneplus.h" static uint profile_numbers[5] = {0, 1, 2, 3, 4}; static void koneplus_profile_activated(struct koneplus_device *koneplus, uint new_profile) { koneplus->actual_profile = new_profile; } static int koneplus_send_control(struct usb_device *usb_dev, uint value, enum koneplus_control_requests request) { struct roccat_common2_control control; if ((request == KONEPLUS_CONTROL_REQUEST_PROFILE_SETTINGS || request == KONEPLUS_CONTROL_REQUEST_PROFILE_BUTTONS) && value > 4) return -EINVAL; control.command = ROCCAT_COMMON_COMMAND_CONTROL; control.value = value; control.request = request; return roccat_common2_send_with_status(usb_dev, ROCCAT_COMMON_COMMAND_CONTROL, &control, sizeof(struct roccat_common2_control)); } /* retval is 0-4 on success, < 0 on error */ static int koneplus_get_actual_profile(struct usb_device *usb_dev) { struct koneplus_actual_profile buf; int retval; retval = roccat_common2_receive(usb_dev, KONEPLUS_COMMAND_ACTUAL_PROFILE, &buf, KONEPLUS_SIZE_ACTUAL_PROFILE); return retval ? retval : buf.actual_profile; } static int koneplus_set_actual_profile(struct usb_device *usb_dev, int new_profile) { struct koneplus_actual_profile buf; buf.command = KONEPLUS_COMMAND_ACTUAL_PROFILE; buf.size = KONEPLUS_SIZE_ACTUAL_PROFILE; buf.actual_profile = new_profile; return roccat_common2_send_with_status(usb_dev, KONEPLUS_COMMAND_ACTUAL_PROFILE, &buf, KONEPLUS_SIZE_ACTUAL_PROFILE); } static ssize_t koneplus_sysfs_read(struct file *fp, struct kobject *kobj, char *buf, loff_t off, size_t count, size_t real_size, uint command) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct koneplus_device *koneplus = hid_get_drvdata(dev_get_drvdata(dev)); struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); int retval; if (off >= real_size) return 0; if (off != 0 || count != real_size) return -EINVAL; mutex_lock(&koneplus->koneplus_lock); retval = roccat_common2_receive(usb_dev, command, buf, real_size); mutex_unlock(&koneplus->koneplus_lock); if (retval) return retval; return real_size; } static ssize_t koneplus_sysfs_write(struct file *fp, struct kobject *kobj, void const *buf, loff_t off, size_t count, size_t real_size, uint command) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct koneplus_device *koneplus = hid_get_drvdata(dev_get_drvdata(dev)); struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); int retval; if (off != 0 || count != real_size) return -EINVAL; mutex_lock(&koneplus->koneplus_lock); retval = roccat_common2_send_with_status(usb_dev, command, buf, real_size); mutex_unlock(&koneplus->koneplus_lock); if (retval) return retval; return real_size; } #define KONEPLUS_SYSFS_W(thingy, THINGY) \ static ssize_t koneplus_sysfs_write_ ## thingy(struct file *fp, \ struct kobject *kobj, struct bin_attribute *attr, char *buf, \ loff_t off, size_t count) \ { \ return koneplus_sysfs_write(fp, kobj, buf, off, count, \ KONEPLUS_SIZE_ ## THINGY, KONEPLUS_COMMAND_ ## THINGY); \ } #define KONEPLUS_SYSFS_R(thingy, THINGY) \ static ssize_t koneplus_sysfs_read_ ## thingy(struct file *fp, \ struct kobject *kobj, struct bin_attribute *attr, char *buf, \ loff_t off, size_t count) \ { \ return koneplus_sysfs_read(fp, kobj, buf, off, count, \ KONEPLUS_SIZE_ ## THINGY, KONEPLUS_COMMAND_ ## THINGY); \ } #define KONEPLUS_SYSFS_RW(thingy, THINGY) \ KONEPLUS_SYSFS_W(thingy, THINGY) \ KONEPLUS_SYSFS_R(thingy, THINGY) #define KONEPLUS_BIN_ATTRIBUTE_RW(thingy, THINGY) \ KONEPLUS_SYSFS_RW(thingy, THINGY); \ static struct bin_attribute bin_attr_##thingy = { \ .attr = { .name = #thingy, .mode = 0660 }, \ .size = KONEPLUS_SIZE_ ## THINGY, \ .read = koneplus_sysfs_read_ ## thingy, \ .write = koneplus_sysfs_write_ ## thingy \ } #define KONEPLUS_BIN_ATTRIBUTE_R(thingy, THINGY) \ KONEPLUS_SYSFS_R(thingy, THINGY); \ static struct bin_attribute bin_attr_##thingy = { \ .attr = { .name = #thingy, .mode = 0440 }, \ .size = KONEPLUS_SIZE_ ## THINGY, \ .read = koneplus_sysfs_read_ ## thingy, \ } #define KONEPLUS_BIN_ATTRIBUTE_W(thingy, THINGY) \ KONEPLUS_SYSFS_W(thingy, THINGY); \ static struct bin_attribute bin_attr_##thingy = { \ .attr = { .name = #thingy, .mode = 0220 }, \ .size = KONEPLUS_SIZE_ ## THINGY, \ .write = koneplus_sysfs_write_ ## thingy \ } KONEPLUS_BIN_ATTRIBUTE_W(control, CONTROL); KONEPLUS_BIN_ATTRIBUTE_W(talk, TALK); KONEPLUS_BIN_ATTRIBUTE_W(macro, MACRO); KONEPLUS_BIN_ATTRIBUTE_R(tcu_image, TCU_IMAGE); KONEPLUS_BIN_ATTRIBUTE_RW(info, INFO); KONEPLUS_BIN_ATTRIBUTE_RW(sensor, SENSOR); KONEPLUS_BIN_ATTRIBUTE_RW(tcu, TCU); KONEPLUS_BIN_ATTRIBUTE_RW(profile_settings, PROFILE_SETTINGS); KONEPLUS_BIN_ATTRIBUTE_RW(profile_buttons, PROFILE_BUTTONS); static ssize_t koneplus_sysfs_read_profilex_settings(struct file *fp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); ssize_t retval; retval = koneplus_send_control(usb_dev, *(uint *)(attr->private), KONEPLUS_CONTROL_REQUEST_PROFILE_SETTINGS); if (retval) return retval; return koneplus_sysfs_read(fp, kobj, buf, off, count, KONEPLUS_SIZE_PROFILE_SETTINGS, KONEPLUS_COMMAND_PROFILE_SETTINGS); } static ssize_t koneplus_sysfs_read_profilex_buttons(struct file *fp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); ssize_t retval; retval = koneplus_send_control(usb_dev, *(uint *)(attr->private), KONEPLUS_CONTROL_REQUEST_PROFILE_BUTTONS); if (retval) return retval; return koneplus_sysfs_read(fp, kobj, buf, off, count, KONEPLUS_SIZE_PROFILE_BUTTONS, KONEPLUS_COMMAND_PROFILE_BUTTONS); } #define PROFILE_ATTR(number) \ static struct bin_attribute bin_attr_profile##number##_settings = { \ .attr = { .name = "profile" #number "_settings", .mode = 0440 }, \ .size = KONEPLUS_SIZE_PROFILE_SETTINGS, \ .read = koneplus_sysfs_read_profilex_settings, \ .private = &profile_numbers[number-1], \ }; \ static struct bin_attribute bin_attr_profile##number##_buttons = { \ .attr = { .name = "profile" #number "_buttons", .mode = 0440 }, \ .size = KONEPLUS_SIZE_PROFILE_BUTTONS, \ .read = koneplus_sysfs_read_profilex_buttons, \ .private = &profile_numbers[number-1], \ }; PROFILE_ATTR(1); PROFILE_ATTR(2); PROFILE_ATTR(3); PROFILE_ATTR(4); PROFILE_ATTR(5); static ssize_t koneplus_sysfs_show_actual_profile(struct device *dev, struct device_attribute *attr, char *buf) { struct koneplus_device *koneplus = hid_get_drvdata(dev_get_drvdata(dev->parent->parent)); return snprintf(buf, PAGE_SIZE, "%d\n", koneplus->actual_profile); } static ssize_t koneplus_sysfs_set_actual_profile(struct device *dev, struct device_attribute *attr, char const *buf, size_t size) { struct koneplus_device *koneplus; struct usb_device *usb_dev; unsigned long profile; int retval; struct koneplus_roccat_report roccat_report; dev = dev->parent->parent; koneplus = hid_get_drvdata(dev_get_drvdata(dev)); usb_dev = interface_to_usbdev(to_usb_interface(dev)); retval = kstrtoul(buf, 10, &profile); if (retval) return retval; if (profile > 4) return -EINVAL; mutex_lock(&koneplus->koneplus_lock); retval = koneplus_set_actual_profile(usb_dev, profile); if (retval) { mutex_unlock(&koneplus->koneplus_lock); return retval; } koneplus_profile_activated(koneplus, profile); roccat_report.type = KONEPLUS_MOUSE_REPORT_BUTTON_TYPE_PROFILE; roccat_report.data1 = profile + 1; roccat_report.data2 = 0; roccat_report.profile = profile + 1; roccat_report_event(koneplus->chrdev_minor, (uint8_t const *)&roccat_report); mutex_unlock(&koneplus->koneplus_lock); return size; } static DEVICE_ATTR(actual_profile, 0660, koneplus_sysfs_show_actual_profile, koneplus_sysfs_set_actual_profile); static DEVICE_ATTR(startup_profile, 0660, koneplus_sysfs_show_actual_profile, koneplus_sysfs_set_actual_profile); static ssize_t koneplus_sysfs_show_firmware_version(struct device *dev, struct device_attribute *attr, char *buf) { struct koneplus_device *koneplus; struct usb_device *usb_dev; struct koneplus_info info; dev = dev->parent->parent; koneplus = hid_get_drvdata(dev_get_drvdata(dev)); usb_dev = interface_to_usbdev(to_usb_interface(dev)); mutex_lock(&koneplus->koneplus_lock); roccat_common2_receive(usb_dev, KONEPLUS_COMMAND_INFO, &info, KONEPLUS_SIZE_INFO); mutex_unlock(&koneplus->koneplus_lock); return snprintf(buf, PAGE_SIZE, "%d\n", info.firmware_version); } static DEVICE_ATTR(firmware_version, 0440, koneplus_sysfs_show_firmware_version, NULL); static struct attribute *koneplus_attrs[] = { &dev_attr_actual_profile.attr, &dev_attr_startup_profile.attr, &dev_attr_firmware_version.attr, NULL, }; static struct bin_attribute *koneplus_bin_attributes[] = { &bin_attr_control, &bin_attr_talk, &bin_attr_macro, &bin_attr_tcu_image, &bin_attr_info, &bin_attr_sensor, &bin_attr_tcu, &bin_attr_profile_settings, &bin_attr_profile_buttons, &bin_attr_profile1_settings, &bin_attr_profile2_settings, &bin_attr_profile3_settings, &bin_attr_profile4_settings, &bin_attr_profile5_settings, &bin_attr_profile1_buttons, &bin_attr_profile2_buttons, &bin_attr_profile3_buttons, &bin_attr_profile4_buttons, &bin_attr_profile5_buttons, NULL, }; static const struct attribute_group koneplus_group = { .attrs = koneplus_attrs, .bin_attrs = koneplus_bin_attributes, }; static const struct attribute_group *koneplus_groups[] = { &koneplus_group, NULL, }; static const struct class koneplus_class = { .name = "koneplus", .dev_groups = koneplus_groups, }; static int koneplus_init_koneplus_device_struct(struct usb_device *usb_dev, struct koneplus_device *koneplus) { int retval; mutex_init(&koneplus->koneplus_lock); retval = koneplus_get_actual_profile(usb_dev); if (retval < 0) return retval; koneplus_profile_activated(koneplus, retval); return 0; } static int koneplus_init_specials(struct hid_device *hdev) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct usb_device *usb_dev = interface_to_usbdev(intf); struct koneplus_device *koneplus; int retval; if (intf->cur_altsetting->desc.bInterfaceProtocol == USB_INTERFACE_PROTOCOL_MOUSE) { koneplus = kzalloc(sizeof(*koneplus), GFP_KERNEL); if (!koneplus) { hid_err(hdev, "can't alloc device descriptor\n"); return -ENOMEM; } hid_set_drvdata(hdev, koneplus); retval = koneplus_init_koneplus_device_struct(usb_dev, koneplus); if (retval) { hid_err(hdev, "couldn't init struct koneplus_device\n"); goto exit_free; } retval = roccat_connect(&koneplus_class, hdev, sizeof(struct koneplus_roccat_report)); if (retval < 0) { hid_err(hdev, "couldn't init char dev\n"); } else { koneplus->chrdev_minor = retval; koneplus->roccat_claimed = 1; } } else { hid_set_drvdata(hdev, NULL); } return 0; exit_free: kfree(koneplus); return retval; } static void koneplus_remove_specials(struct hid_device *hdev) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct koneplus_device *koneplus; if (intf->cur_altsetting->desc.bInterfaceProtocol == USB_INTERFACE_PROTOCOL_MOUSE) { koneplus = hid_get_drvdata(hdev); if (koneplus->roccat_claimed) roccat_disconnect(koneplus->chrdev_minor); kfree(koneplus); } } static int koneplus_probe(struct hid_device *hdev, const struct hid_device_id *id) { int retval; if (!hid_is_usb(hdev)) return -EINVAL; retval = hid_parse(hdev); if (retval) { hid_err(hdev, "parse failed\n"); goto exit; } retval = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (retval) { hid_err(hdev, "hw start failed\n"); goto exit; } retval = koneplus_init_specials(hdev); if (retval) { hid_err(hdev, "couldn't install mouse\n"); goto exit_stop; } return 0; exit_stop: hid_hw_stop(hdev); exit: return retval; } static void koneplus_remove(struct hid_device *hdev) { koneplus_remove_specials(hdev); hid_hw_stop(hdev); } static void koneplus_keep_values_up_to_date(struct koneplus_device *koneplus, u8 const *data) { struct koneplus_mouse_report_button const *button_report; switch (data[0]) { case KONEPLUS_MOUSE_REPORT_NUMBER_BUTTON: button_report = (struct koneplus_mouse_report_button const *)data; switch (button_report->type) { case KONEPLUS_MOUSE_REPORT_BUTTON_TYPE_PROFILE: koneplus_profile_activated(koneplus, button_report->data1 - 1); break; } break; } } static void koneplus_report_to_chrdev(struct koneplus_device const *koneplus, u8 const *data) { struct koneplus_roccat_report roccat_report; struct koneplus_mouse_report_button const *button_report; if (data[0] != KONEPLUS_MOUSE_REPORT_NUMBER_BUTTON) return; button_report = (struct koneplus_mouse_report_button const *)data; if ((button_report->type == KONEPLUS_MOUSE_REPORT_BUTTON_TYPE_QUICKLAUNCH || button_report->type == KONEPLUS_MOUSE_REPORT_BUTTON_TYPE_TIMER) && button_report->data2 != KONEPLUS_MOUSE_REPORT_BUTTON_ACTION_PRESS) return; roccat_report.type = button_report->type; roccat_report.data1 = button_report->data1; roccat_report.data2 = button_report->data2; roccat_report.profile = koneplus->actual_profile + 1; roccat_report_event(koneplus->chrdev_minor, (uint8_t const *)&roccat_report); } static int koneplus_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, int size) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct koneplus_device *koneplus = hid_get_drvdata(hdev); if (intf->cur_altsetting->desc.bInterfaceProtocol != USB_INTERFACE_PROTOCOL_MOUSE) return 0; if (koneplus == NULL) return 0; koneplus_keep_values_up_to_date(koneplus, data); if (koneplus->roccat_claimed) koneplus_report_to_chrdev(koneplus, data); return 0; } static const struct hid_device_id koneplus_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_ROCCAT, USB_DEVICE_ID_ROCCAT_KONEPLUS) }, { HID_USB_DEVICE(USB_VENDOR_ID_ROCCAT, USB_DEVICE_ID_ROCCAT_KONEXTD) }, { } }; MODULE_DEVICE_TABLE(hid, koneplus_devices); static struct hid_driver koneplus_driver = { .name = "koneplus", .id_table = koneplus_devices, .probe = koneplus_probe, .remove = koneplus_remove, .raw_event = koneplus_raw_event }; static int __init koneplus_init(void) { int retval; /* class name has to be same as driver name */ retval = class_register(&koneplus_class); if (retval) return retval; retval = hid_register_driver(&koneplus_driver); if (retval) class_unregister(&koneplus_class); return retval; } static void __exit koneplus_exit(void) { hid_unregister_driver(&koneplus_driver); class_unregister(&koneplus_class); } module_init(koneplus_init); module_exit(koneplus_exit); MODULE_AUTHOR("Stefan Achatz"); MODULE_DESCRIPTION("USB Roccat Kone[+]/XTD driver"); MODULE_LICENSE("GPL v2");
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 // SPDX-License-Identifier: GPL-2.0-or-later /* * Roccat Pyra driver for Linux * * Copyright (c) 2010 Stefan Achatz <erazor_de@users.sourceforge.net> */ /* */ /* * Roccat Pyra is a mobile gamer mouse which comes in wired and wireless * variant. Wireless variant is not tested. * Userland tools can be found at http://sourceforge.net/projects/roccat */ #include <linux/device.h> #include <linux/input.h> #include <linux/hid.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/hid-roccat.h> #include "hid-ids.h" #include "hid-roccat-common.h" #include "hid-roccat-pyra.h" static uint profile_numbers[5] = {0, 1, 2, 3, 4}; static void profile_activated(struct pyra_device *pyra, unsigned int new_profile) { if (new_profile >= ARRAY_SIZE(pyra->profile_settings)) return; pyra->actual_profile = new_profile; pyra->actual_cpi = pyra->profile_settings[pyra->actual_profile].y_cpi; } static int pyra_send_control(struct usb_device *usb_dev, int value, enum pyra_control_requests request) { struct roccat_common2_control control; if ((request == PYRA_CONTROL_REQUEST_PROFILE_SETTINGS || request == PYRA_CONTROL_REQUEST_PROFILE_BUTTONS) && (value < 0 || value > 4)) return -EINVAL; control.command = ROCCAT_COMMON_COMMAND_CONTROL; control.value = value; control.request = request; return roccat_common2_send(usb_dev, ROCCAT_COMMON_COMMAND_CONTROL, &control, sizeof(struct roccat_common2_control)); } static int pyra_get_profile_settings(struct usb_device *usb_dev, struct pyra_profile_settings *buf, int number) { int retval; retval = pyra_send_control(usb_dev, number, PYRA_CONTROL_REQUEST_PROFILE_SETTINGS); if (retval) return retval; return roccat_common2_receive(usb_dev, PYRA_COMMAND_PROFILE_SETTINGS, buf, PYRA_SIZE_PROFILE_SETTINGS); } static int pyra_get_settings(struct usb_device *usb_dev, struct pyra_settings *buf) { return roccat_common2_receive(usb_dev, PYRA_COMMAND_SETTINGS, buf, PYRA_SIZE_SETTINGS); } static int pyra_set_settings(struct usb_device *usb_dev, struct pyra_settings const *settings) { return roccat_common2_send_with_status(usb_dev, PYRA_COMMAND_SETTINGS, settings, PYRA_SIZE_SETTINGS); } static ssize_t pyra_sysfs_read(struct file *fp, struct kobject *kobj, char *buf, loff_t off, size_t count, size_t real_size, uint command) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct pyra_device *pyra = hid_get_drvdata(dev_get_drvdata(dev)); struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); int retval; if (off >= real_size) return 0; if (off != 0 || count != real_size) return -EINVAL; mutex_lock(&pyra->pyra_lock); retval = roccat_common2_receive(usb_dev, command, buf, real_size); mutex_unlock(&pyra->pyra_lock); if (retval) return retval; return real_size; } static ssize_t pyra_sysfs_write(struct file *fp, struct kobject *kobj, void const *buf, loff_t off, size_t count, size_t real_size, uint command) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct pyra_device *pyra = hid_get_drvdata(dev_get_drvdata(dev)); struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); int retval; if (off != 0 || count != real_size) return -EINVAL; mutex_lock(&pyra->pyra_lock); retval = roccat_common2_send_with_status(usb_dev, command, (void *)buf, real_size); mutex_unlock(&pyra->pyra_lock); if (retval) return retval; return real_size; } #define PYRA_SYSFS_W(thingy, THINGY) \ static ssize_t pyra_sysfs_write_ ## thingy(struct file *fp, \ struct kobject *kobj, struct bin_attribute *attr, char *buf, \ loff_t off, size_t count) \ { \ return pyra_sysfs_write(fp, kobj, buf, off, count, \ PYRA_SIZE_ ## THINGY, PYRA_COMMAND_ ## THINGY); \ } #define PYRA_SYSFS_R(thingy, THINGY) \ static ssize_t pyra_sysfs_read_ ## thingy(struct file *fp, \ struct kobject *kobj, struct bin_attribute *attr, char *buf, \ loff_t off, size_t count) \ { \ return pyra_sysfs_read(fp, kobj, buf, off, count, \ PYRA_SIZE_ ## THINGY, PYRA_COMMAND_ ## THINGY); \ } #define PYRA_SYSFS_RW(thingy, THINGY) \ PYRA_SYSFS_W(thingy, THINGY) \ PYRA_SYSFS_R(thingy, THINGY) #define PYRA_BIN_ATTRIBUTE_RW(thingy, THINGY) \ PYRA_SYSFS_RW(thingy, THINGY); \ static struct bin_attribute bin_attr_##thingy = { \ .attr = { .name = #thingy, .mode = 0660 }, \ .size = PYRA_SIZE_ ## THINGY, \ .read = pyra_sysfs_read_ ## thingy, \ .write = pyra_sysfs_write_ ## thingy \ } #define PYRA_BIN_ATTRIBUTE_R(thingy, THINGY) \ PYRA_SYSFS_R(thingy, THINGY); \ static struct bin_attribute bin_attr_##thingy = { \ .attr = { .name = #thingy, .mode = 0440 }, \ .size = PYRA_SIZE_ ## THINGY, \ .read = pyra_sysfs_read_ ## thingy, \ } #define PYRA_BIN_ATTRIBUTE_W(thingy, THINGY) \ PYRA_SYSFS_W(thingy, THINGY); \ static struct bin_attribute bin_attr_##thingy = { \ .attr = { .name = #thingy, .mode = 0220 }, \ .size = PYRA_SIZE_ ## THINGY, \ .write = pyra_sysfs_write_ ## thingy \ } PYRA_BIN_ATTRIBUTE_W(control, CONTROL); PYRA_BIN_ATTRIBUTE_RW(info, INFO); PYRA_BIN_ATTRIBUTE_RW(profile_settings, PROFILE_SETTINGS); PYRA_BIN_ATTRIBUTE_RW(profile_buttons, PROFILE_BUTTONS); static ssize_t pyra_sysfs_read_profilex_settings(struct file *fp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); ssize_t retval; retval = pyra_send_control(usb_dev, *(uint *)(attr->private), PYRA_CONTROL_REQUEST_PROFILE_SETTINGS); if (retval) return retval; return pyra_sysfs_read(fp, kobj, buf, off, count, PYRA_SIZE_PROFILE_SETTINGS, PYRA_COMMAND_PROFILE_SETTINGS); } static ssize_t pyra_sysfs_read_profilex_buttons(struct file *fp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); ssize_t retval; retval = pyra_send_control(usb_dev, *(uint *)(attr->private), PYRA_CONTROL_REQUEST_PROFILE_BUTTONS); if (retval) return retval; return pyra_sysfs_read(fp, kobj, buf, off, count, PYRA_SIZE_PROFILE_BUTTONS, PYRA_COMMAND_PROFILE_BUTTONS); } #define PROFILE_ATTR(number) \ static struct bin_attribute bin_attr_profile##number##_settings = { \ .attr = { .name = "profile" #number "_settings", .mode = 0440 }, \ .size = PYRA_SIZE_PROFILE_SETTINGS, \ .read = pyra_sysfs_read_profilex_settings, \ .private = &profile_numbers[number-1], \ }; \ static struct bin_attribute bin_attr_profile##number##_buttons = { \ .attr = { .name = "profile" #number "_buttons", .mode = 0440 }, \ .size = PYRA_SIZE_PROFILE_BUTTONS, \ .read = pyra_sysfs_read_profilex_buttons, \ .private = &profile_numbers[number-1], \ }; PROFILE_ATTR(1); PROFILE_ATTR(2); PROFILE_ATTR(3); PROFILE_ATTR(4); PROFILE_ATTR(5); static ssize_t pyra_sysfs_write_settings(struct file *fp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct pyra_device *pyra = hid_get_drvdata(dev_get_drvdata(dev)); struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); int retval = 0; struct pyra_roccat_report roccat_report; struct pyra_settings const *settings; if (off != 0 || count != PYRA_SIZE_SETTINGS) return -EINVAL; settings = (struct pyra_settings const *)buf; if (settings->startup_profile >= ARRAY_SIZE(pyra->profile_settings)) return -EINVAL; mutex_lock(&pyra->pyra_lock); retval = pyra_set_settings(usb_dev, settings); if (retval) { mutex_unlock(&pyra->pyra_lock); return retval; } profile_activated(pyra, settings->startup_profile); roccat_report.type = PYRA_MOUSE_EVENT_BUTTON_TYPE_PROFILE_2; roccat_report.value = settings->startup_profile + 1; roccat_report.key = 0; roccat_report_event(pyra->chrdev_minor, (uint8_t const *)&roccat_report); mutex_unlock(&pyra->pyra_lock); return PYRA_SIZE_SETTINGS; } PYRA_SYSFS_R(settings, SETTINGS); static struct bin_attribute bin_attr_settings = __BIN_ATTR(settings, (S_IWUSR | S_IRUGO), pyra_sysfs_read_settings, pyra_sysfs_write_settings, PYRA_SIZE_SETTINGS); static ssize_t pyra_sysfs_show_actual_cpi(struct device *dev, struct device_attribute *attr, char *buf) { struct pyra_device *pyra = hid_get_drvdata(dev_get_drvdata(dev->parent->parent)); return snprintf(buf, PAGE_SIZE, "%d\n", pyra->actual_cpi); } static DEVICE_ATTR(actual_cpi, 0440, pyra_sysfs_show_actual_cpi, NULL); static ssize_t pyra_sysfs_show_actual_profile(struct device *dev, struct device_attribute *attr, char *buf) { struct pyra_device *pyra = hid_get_drvdata(dev_get_drvdata(dev->parent->parent)); struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); struct pyra_settings settings; mutex_lock(&pyra->pyra_lock); roccat_common2_receive(usb_dev, PYRA_COMMAND_SETTINGS, &settings, PYRA_SIZE_SETTINGS); mutex_unlock(&pyra->pyra_lock); return snprintf(buf, PAGE_SIZE, "%d\n", settings.startup_profile); } static DEVICE_ATTR(actual_profile, 0440, pyra_sysfs_show_actual_profile, NULL); static DEVICE_ATTR(startup_profile, 0440, pyra_sysfs_show_actual_profile, NULL); static ssize_t pyra_sysfs_show_firmware_version(struct device *dev, struct device_attribute *attr, char *buf) { struct pyra_device *pyra; struct usb_device *usb_dev; struct pyra_info info; dev = dev->parent->parent; pyra = hid_get_drvdata(dev_get_drvdata(dev)); usb_dev = interface_to_usbdev(to_usb_interface(dev)); mutex_lock(&pyra->pyra_lock); roccat_common2_receive(usb_dev, PYRA_COMMAND_INFO, &info, PYRA_SIZE_INFO); mutex_unlock(&pyra->pyra_lock); return snprintf(buf, PAGE_SIZE, "%d\n", info.firmware_version); } static DEVICE_ATTR(firmware_version, 0440, pyra_sysfs_show_firmware_version, NULL); static struct attribute *pyra_attrs[] = { &dev_attr_actual_cpi.attr, &dev_attr_actual_profile.attr, &dev_attr_firmware_version.attr, &dev_attr_startup_profile.attr, NULL, }; static struct bin_attribute *pyra_bin_attributes[] = { &bin_attr_control, &bin_attr_info, &bin_attr_profile_settings, &bin_attr_profile_buttons, &bin_attr_settings, &bin_attr_profile1_settings, &bin_attr_profile2_settings, &bin_attr_profile3_settings, &bin_attr_profile4_settings, &bin_attr_profile5_settings, &bin_attr_profile1_buttons, &bin_attr_profile2_buttons, &bin_attr_profile3_buttons, &bin_attr_profile4_buttons, &bin_attr_profile5_buttons, NULL, }; static const struct attribute_group pyra_group = { .attrs = pyra_attrs, .bin_attrs = pyra_bin_attributes, }; static const struct attribute_group *pyra_groups[] = { &pyra_group, NULL, }; /* pyra_class is used for creating sysfs attributes via roccat char device */ static const struct class pyra_class = { .name = "pyra", .dev_groups = pyra_groups, }; static int pyra_init_pyra_device_struct(struct usb_device *usb_dev, struct pyra_device *pyra) { struct pyra_settings settings; int retval, i; mutex_init(&pyra->pyra_lock); retval = pyra_get_settings(usb_dev, &settings); if (retval) return retval; for (i = 0; i < 5; ++i) { retval = pyra_get_profile_settings(usb_dev, &pyra->profile_settings[i], i); if (retval) return retval; } profile_activated(pyra, settings.startup_profile); return 0; } static int pyra_init_specials(struct hid_device *hdev) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct usb_device *usb_dev = interface_to_usbdev(intf); struct pyra_device *pyra; int retval; if (intf->cur_altsetting->desc.bInterfaceProtocol == USB_INTERFACE_PROTOCOL_MOUSE) { pyra = kzalloc(sizeof(*pyra), GFP_KERNEL); if (!pyra) { hid_err(hdev, "can't alloc device descriptor\n"); return -ENOMEM; } hid_set_drvdata(hdev, pyra); retval = pyra_init_pyra_device_struct(usb_dev, pyra); if (retval) { hid_err(hdev, "couldn't init struct pyra_device\n"); goto exit_free; } retval = roccat_connect(&pyra_class, hdev, sizeof(struct pyra_roccat_report)); if (retval < 0) { hid_err(hdev, "couldn't init char dev\n"); } else { pyra->chrdev_minor = retval; pyra->roccat_claimed = 1; } } else { hid_set_drvdata(hdev, NULL); } return 0; exit_free: kfree(pyra); return retval; } static void pyra_remove_specials(struct hid_device *hdev) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct pyra_device *pyra; if (intf->cur_altsetting->desc.bInterfaceProtocol == USB_INTERFACE_PROTOCOL_MOUSE) { pyra = hid_get_drvdata(hdev); if (pyra->roccat_claimed) roccat_disconnect(pyra->chrdev_minor); kfree(hid_get_drvdata(hdev)); } } static int pyra_probe(struct hid_device *hdev, const struct hid_device_id *id) { int retval; if (!hid_is_usb(hdev)) return -EINVAL; retval = hid_parse(hdev); if (retval) { hid_err(hdev, "parse failed\n"); goto exit; } retval = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (retval) { hid_err(hdev, "hw start failed\n"); goto exit; } retval = pyra_init_specials(hdev); if (retval) { hid_err(hdev, "couldn't install mouse\n"); goto exit_stop; } return 0; exit_stop: hid_hw_stop(hdev); exit: return retval; } static void pyra_remove(struct hid_device *hdev) { pyra_remove_specials(hdev); hid_hw_stop(hdev); } static void pyra_keep_values_up_to_date(struct pyra_device *pyra, u8 const *data) { struct pyra_mouse_event_button const *button_event; switch (data[0]) { case PYRA_MOUSE_REPORT_NUMBER_BUTTON: button_event = (struct pyra_mouse_event_button const *)data; switch (button_event->type) { case PYRA_MOUSE_EVENT_BUTTON_TYPE_PROFILE_2: profile_activated(pyra, button_event->data1 - 1); break; case PYRA_MOUSE_EVENT_BUTTON_TYPE_CPI: pyra->actual_cpi = button_event->data1; break; } break; } } static void pyra_report_to_chrdev(struct pyra_device const *pyra, u8 const *data) { struct pyra_roccat_report roccat_report; struct pyra_mouse_event_button const *button_event; if (data[0] != PYRA_MOUSE_REPORT_NUMBER_BUTTON) return; button_event = (struct pyra_mouse_event_button const *)data; switch (button_event->type) { case PYRA_MOUSE_EVENT_BUTTON_TYPE_PROFILE_2: case PYRA_MOUSE_EVENT_BUTTON_TYPE_CPI: roccat_report.type = button_event->type; roccat_report.value = button_event->data1; roccat_report.key = 0; roccat_report_event(pyra->chrdev_minor, (uint8_t const *)&roccat_report); break; case PYRA_MOUSE_EVENT_BUTTON_TYPE_MACRO: case PYRA_MOUSE_EVENT_BUTTON_TYPE_SHORTCUT: case PYRA_MOUSE_EVENT_BUTTON_TYPE_QUICKLAUNCH: if (button_event->data2 == PYRA_MOUSE_EVENT_BUTTON_PRESS) { roccat_report.type = button_event->type; roccat_report.key = button_event->data1; /* * pyra reports profile numbers with range 1-5. * Keeping this behaviour. */ roccat_report.value = pyra->actual_profile + 1; roccat_report_event(pyra->chrdev_minor, (uint8_t const *)&roccat_report); } break; } } static int pyra_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, int size) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct pyra_device *pyra = hid_get_drvdata(hdev); if (intf->cur_altsetting->desc.bInterfaceProtocol != USB_INTERFACE_PROTOCOL_MOUSE) return 0; if (pyra == NULL) return 0; pyra_keep_values_up_to_date(pyra, data); if (pyra->roccat_claimed) pyra_report_to_chrdev(pyra, data); return 0; } static const struct hid_device_id pyra_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_ROCCAT, USB_DEVICE_ID_ROCCAT_PYRA_WIRED) }, { HID_USB_DEVICE(USB_VENDOR_ID_ROCCAT, USB_DEVICE_ID_ROCCAT_PYRA_WIRELESS) }, { } }; MODULE_DEVICE_TABLE(hid, pyra_devices); static struct hid_driver pyra_driver = { .name = "pyra", .id_table = pyra_devices, .probe = pyra_probe, .remove = pyra_remove, .raw_event = pyra_raw_event }; static int __init pyra_init(void) { int retval; /* class name has to be same as driver name */ retval = class_register(&pyra_class); if (retval) return retval; retval = hid_register_driver(&pyra_driver); if (retval) class_unregister(&pyra_class); return retval; } static void __exit pyra_exit(void) { hid_unregister_driver(&pyra_driver); class_unregister(&pyra_class); } module_init(pyra_init); module_exit(pyra_exit); MODULE_AUTHOR("Stefan Achatz"); MODULE_DESCRIPTION("USB Roccat Pyra driver"); MODULE_LICENSE("GPL v2");
3730 235 235 51 230 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_H #define _LINUX_SCHED_H /* * Define 'struct task_struct' and provide the main scheduler * APIs (schedule(), wakeup variants, etc.) */ #include <uapi/linux/sched.h> #include <asm/current.h> #include <linux/pid.h> #include <linux/sem.h> #include <linux/shm.h> #include <linux/kmsan_types.h> #include <linux/mutex.h> #include <linux/plist.h> #include <linux/hrtimer.h> #include <linux/irqflags.h> #include <linux/seccomp.h> #include <linux/nodemask.h> #include <linux/rcupdate.h> #include <linux/refcount.h> #include <linux/resource.h> #include <linux/latencytop.h> #include <linux/sched/prio.h> #include <linux/sched/types.h> #include <linux/signal_types.h> #include <linux/syscall_user_dispatch.h> #include <linux/mm_types_task.h> #include <linux/task_io_accounting.h> #include <linux/posix-timers.h> #include <linux/rseq.h> #include <linux/seqlock.h> #include <linux/kcsan.h> #include <linux/rv.h> #include <linux/livepatch_sched.h> #include <asm/kmap_size.h> /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; struct bio_list; struct blk_plug; struct bpf_local_storage; struct bpf_run_ctx; struct capture_control; struct cfs_rq; struct fs_struct; struct futex_pi_state; struct io_context; struct io_uring_task; struct mempolicy; struct nameidata; struct nsproxy; struct perf_event_context; struct pid_namespace; struct pipe_inode_info; struct rcu_node; struct reclaim_state; struct robust_list_head; struct root_domain; struct rq; struct sched_attr; struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; struct user_event_mm; /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). * * We have two separate sets of flags: task->__state * is about runnability, while task->exit_state are * about the task exiting. Confusing, but this way * modifying one set can't modify the other one by * mistake. */ /* Used in tsk->__state: */ #define TASK_RUNNING 0x00000000 #define TASK_INTERRUPTIBLE 0x00000001 #define TASK_UNINTERRUPTIBLE 0x00000002 #define __TASK_STOPPED 0x00000004 #define __TASK_TRACED 0x00000008 /* Used in tsk->exit_state: */ #define EXIT_DEAD 0x00000010 #define EXIT_ZOMBIE 0x00000020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->__state again: */ #define TASK_PARKED 0x00000040 #define TASK_DEAD 0x00000080 #define TASK_WAKEKILL 0x00000100 #define TASK_WAKING 0x00000200 #define TASK_NOLOAD 0x00000400 #define TASK_NEW 0x00000800 #define TASK_RTLOCK_WAIT 0x00001000 #define TASK_FREEZABLE 0x00002000 #define __TASK_FREEZABLE_UNSAFE (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP)) #define TASK_FROZEN 0x00008000 #define TASK_STATE_MAX 0x00010000 #define TASK_ANY (TASK_STATE_MAX-1) /* * DO NOT ADD ANY NEW USERS ! */ #define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE) /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) #define TASK_TRACED __TASK_TRACED #define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) /* Convenience macros for the sake of wake_up(): */ #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ TASK_PARKED) #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) #define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0) #define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0) #define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0) /* * Special states are those that do not use the normal wait-loop pattern. See * the comment with set_special_state(). */ #define is_special_task_state(state) \ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD)) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP # define debug_normal_state_change(state_value) \ do { \ WARN_ON_ONCE(is_special_task_state(state_value)); \ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_special_state_change(state_value) \ do { \ WARN_ON_ONCE(!is_special_task_state(state_value)); \ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_rtlock_wait_set_state() \ do { \ current->saved_state_change = current->task_state_change;\ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_rtlock_wait_restore_state() \ do { \ current->task_state_change = current->saved_state_change;\ } while (0) #else # define debug_normal_state_change(cond) do { } while (0) # define debug_special_state_change(cond) do { } while (0) # define debug_rtlock_wait_set_state() do { } while (0) # define debug_rtlock_wait_restore_state() do { } while (0) #endif /* * set_current_state() includes a barrier so that the write of current->__state * is correctly serialised wrt the caller's subsequent test of whether to * actually sleep: * * for (;;) { * set_current_state(TASK_UNINTERRUPTIBLE); * if (CONDITION) * break; * * schedule(); * } * __set_current_state(TASK_RUNNING); * * If the caller does not need such serialisation (because, for instance, the * CONDITION test and condition change and wakeup are under the same lock) then * use __set_current_state(). * * The above is typically ordered against the wakeup, which does: * * CONDITION = 1; * wake_up_state(p, TASK_UNINTERRUPTIBLE); * * where wake_up_state()/try_to_wake_up() executes a full memory barrier before * accessing p->__state. * * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is, * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). * * However, with slightly different timing the wakeup TASK_RUNNING store can * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not * a problem either because that will result in one extra go around the loop * and our @cond test will save the day. * * Also see the comments of try_to_wake_up(). */ #define __set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ WRITE_ONCE(current->__state, (state_value)); \ } while (0) #define set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ smp_store_mb(current->__state, (state_value)); \ } while (0) /* * set_special_state() should be used for those states when the blocking task * can not use the regular condition based wait-loop. In that case we must * serialize against wakeups such that any possible in-flight TASK_RUNNING * stores will not collide with our state change. */ #define set_special_state(state_value) \ do { \ unsigned long flags; /* may shadow */ \ \ raw_spin_lock_irqsave(&current->pi_lock, flags); \ debug_special_state_change((state_value)); \ WRITE_ONCE(current->__state, (state_value)); \ raw_spin_unlock_irqrestore(&current->pi_lock, flags); \ } while (0) /* * PREEMPT_RT specific variants for "sleeping" spin/rwlocks * * RT's spin/rwlock substitutions are state preserving. The state of the * task when blocking on the lock is saved in task_struct::saved_state and * restored after the lock has been acquired. These operations are * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT * lock related wakeups while the task is blocked on the lock are * redirected to operate on task_struct::saved_state to ensure that these * are not dropped. On restore task_struct::saved_state is set to * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail. * * The lock operation looks like this: * * current_save_and_set_rtlock_wait_state(); * for (;;) { * if (try_lock()) * break; * raw_spin_unlock_irq(&lock->wait_lock); * schedule_rtlock(); * raw_spin_lock_irq(&lock->wait_lock); * set_current_state(TASK_RTLOCK_WAIT); * } * current_restore_rtlock_saved_state(); */ #define current_save_and_set_rtlock_wait_state() \ do { \ lockdep_assert_irqs_disabled(); \ raw_spin_lock(&current->pi_lock); \ current->saved_state = current->__state; \ debug_rtlock_wait_set_state(); \ WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \ raw_spin_unlock(&current->pi_lock); \ } while (0); #define current_restore_rtlock_saved_state() \ do { \ lockdep_assert_irqs_disabled(); \ raw_spin_lock(&current->pi_lock); \ debug_rtlock_wait_restore_state(); \ WRITE_ONCE(current->__state, current->saved_state); \ current->saved_state = TASK_RUNNING; \ raw_spin_unlock(&current->pi_lock); \ } while (0); #define get_current_state() READ_ONCE(current->__state) /* * Define the task command name length as enum, then it can be visible to * BPF programs. */ enum { TASK_COMM_LEN = 16, }; extern void scheduler_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern long schedule_timeout(long timeout); extern long schedule_timeout_interruptible(long timeout); extern long schedule_timeout_killable(long timeout); extern long schedule_timeout_uninterruptible(long timeout); extern long schedule_timeout_idle(long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); asmlinkage void preempt_schedule_irq(void); #ifdef CONFIG_PREEMPT_RT extern void schedule_rtlock(void); #endif extern int __must_check io_schedule_prepare(void); extern void io_schedule_finish(int token); extern long io_schedule_timeout(long timeout); extern void io_schedule(void); /** * struct prev_cputime - snapshot of system and user cputime * @utime: time spent in user mode * @stime: time spent in system mode * @lock: protects the above two fields * * Stores previous user/system time values such that we can guarantee * monotonicity. */ struct prev_cputime { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE u64 utime; u64 stime; raw_spinlock_t lock; #endif }; enum vtime_state { /* Task is sleeping or running in a CPU with VTIME inactive: */ VTIME_INACTIVE = 0, /* Task is idle */ VTIME_IDLE, /* Task runs in kernelspace in a CPU with VTIME active: */ VTIME_SYS, /* Task runs in userspace in a CPU with VTIME active: */ VTIME_USER, /* Task runs as guests in a CPU with VTIME active: */ VTIME_GUEST, }; struct vtime { seqcount_t seqcount; unsigned long long starttime; enum vtime_state state; unsigned int cpu; u64 utime; u64 stime; u64 gtime; }; /* * Utilization clamp constraints. * @UCLAMP_MIN: Minimum utilization * @UCLAMP_MAX: Maximum utilization * @UCLAMP_CNT: Utilization clamp constraints count */ enum uclamp_id { UCLAMP_MIN = 0, UCLAMP_MAX, UCLAMP_CNT }; #ifdef CONFIG_SMP extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; #endif struct sched_param { int sched_priority; }; struct sched_info { #ifdef CONFIG_SCHED_INFO /* Cumulative counters: */ /* # of times we have run on this CPU: */ unsigned long pcount; /* Time spent waiting on a runqueue: */ unsigned long long run_delay; /* Timestamps: */ /* When did we last run on a CPU? */ unsigned long long last_arrival; /* When were we last queued to run? */ unsigned long long last_queued; #endif /* CONFIG_SCHED_INFO */ }; /* * Integer metrics need fixed point arithmetic, e.g., sched/fair * has a few: load, load_avg, util_avg, freq, and capacity. * * We define a basic fixed point arithmetic range, and then formalize * all these metrics based on that basic range. */ # define SCHED_FIXEDPOINT_SHIFT 10 # define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) /* Increase resolution of cpu_capacity calculations */ # define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT # define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) struct load_weight { unsigned long weight; u32 inv_weight; }; /** * struct util_est - Estimation utilization of FAIR tasks * @enqueued: instantaneous estimated utilization of a task/cpu * @ewma: the Exponential Weighted Moving Average (EWMA) * utilization of a task * * Support data structure to track an Exponential Weighted Moving Average * (EWMA) of a FAIR task's utilization. New samples are added to the moving * average each time a task completes an activation. Sample's weight is chosen * so that the EWMA will be relatively insensitive to transient changes to the * task's workload. * * The enqueued attribute has a slightly different meaning for tasks and cpus: * - task: the task's util_avg at last task dequeue time * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU * Thus, the util_est.enqueued of a task represents the contribution on the * estimated utilization of the CPU where that task is currently enqueued. * * Only for tasks we track a moving average of the past instantaneous * estimated utilization. This allows to absorb sporadic drops in utilization * of an otherwise almost periodic task. * * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg * updates. When a task is dequeued, its util_est should not be updated if its * util_avg has not been updated in the meantime. * This information is mapped into the MSB bit of util_est.enqueued at dequeue * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg * for a task) it is safe to use MSB. */ struct util_est { unsigned int enqueued; unsigned int ewma; #define UTIL_EST_WEIGHT_SHIFT 2 #define UTIL_AVG_UNCHANGED 0x80000000 } __attribute__((__aligned__(sizeof(u64)))); /* * The load/runnable/util_avg accumulates an infinite geometric series * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). * * [load_avg definition] * * load_avg = runnable% * scale_load_down(load) * * [runnable_avg definition] * * runnable_avg = runnable% * SCHED_CAPACITY_SCALE * * [util_avg definition] * * util_avg = running% * SCHED_CAPACITY_SCALE * * where runnable% is the time ratio that a sched_entity is runnable and * running% the time ratio that a sched_entity is running. * * For cfs_rq, they are the aggregated values of all runnable and blocked * sched_entities. * * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU * capacity scaling. The scaling is done through the rq_clock_pelt that is used * for computing those signals (see update_rq_clock_pelt()) * * N.B., the above ratios (runnable% and running%) themselves are in the * range of [0, 1]. To do fixed point arithmetics, we therefore scale them * to as large a range as necessary. This is for example reflected by * util_avg's SCHED_CAPACITY_SCALE. * * [Overflow issue] * * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities * with the highest load (=88761), always runnable on a single cfs_rq, * and should not overflow as the number already hits PID_MAX_LIMIT. * * For all other cases (including 32-bit kernels), struct load_weight's * weight will overflow first before we do, because: * * Max(load_avg) <= Max(load.weight) * * Then it is the load_weight's responsibility to consider overflow * issues. */ struct sched_avg { u64 last_update_time; u64 load_sum; u64 runnable_sum; u32 util_sum; u32 period_contrib; unsigned long load_avg; unsigned long runnable_avg; unsigned long util_avg; struct util_est util_est; } ____cacheline_aligned; struct sched_statistics { #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; u64 wait_count; u64 wait_sum; u64 iowait_count; u64 iowait_sum; u64 sleep_start; u64 sleep_max; s64 sum_sleep_runtime; u64 block_start; u64 block_max; s64 sum_block_runtime; u64 exec_max; u64 slice_max; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; u64 nr_failed_migrations_running; u64 nr_failed_migrations_hot; u64 nr_forced_migrations; u64 nr_wakeups; u64 nr_wakeups_sync; u64 nr_wakeups_migrate; u64 nr_wakeups_local; u64 nr_wakeups_remote; u64 nr_wakeups_affine; u64 nr_wakeups_affine_attempts; u64 nr_wakeups_passive; u64 nr_wakeups_idle; #ifdef CONFIG_SCHED_CORE u64 core_forceidle_sum; #endif #endif /* CONFIG_SCHEDSTATS */ } ____cacheline_aligned; struct sched_entity { /* For load-balancing: */ struct load_weight load; struct rb_node run_node; u64 deadline; u64 min_deadline; struct list_head group_node; unsigned int on_rq; u64 exec_start; u64 sum_exec_runtime; u64 prev_sum_exec_runtime; u64 vruntime; s64 vlag; u64 slice; u64 nr_migrations; #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; /* rq on which this entity is (to be) queued: */ struct cfs_rq *cfs_rq; /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; /* cached value of my_q->h_nr_running */ unsigned long runnable_weight; #endif #ifdef CONFIG_SMP /* * Per entity load average tracking. * * Put into separate cache line so it does not * collide with read-mostly values above. */ struct sched_avg avg; #endif }; struct sched_rt_entity { struct list_head run_list; unsigned long timeout; unsigned long watchdog_stamp; unsigned int time_slice; unsigned short on_rq; unsigned short on_list; struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity *parent; /* rq on which this entity is (to be) queued: */ struct rt_rq *rt_rq; /* rq "owned" by this entity/group: */ struct rt_rq *my_q; #endif } __randomize_layout; struct sched_dl_entity { struct rb_node rb_node; /* * Original scheduling parameters. Copied here from sched_attr * during sched_setattr(), they will remain the same until * the next sched_setattr(). */ u64 dl_runtime; /* Maximum runtime for each instance */ u64 dl_deadline; /* Relative deadline of each instance */ u64 dl_period; /* Separation of two instances (period) */ u64 dl_bw; /* dl_runtime / dl_period */ u64 dl_density; /* dl_runtime / dl_deadline */ /* * Actual scheduling parameters. Initialized with the values above, * they are continuously updated during task execution. Note that * the remaining runtime could be < 0 in case we are in overrun. */ s64 runtime; /* Remaining runtime for this instance */ u64 deadline; /* Absolute deadline for this instance */ unsigned int flags; /* Specifying the scheduler behaviour */ /* * Some bool flags: * * @dl_throttled tells if we exhausted the runtime. If so, the * task has to wait for a replenishment to be performed at the * next firing of dl_timer. * * @dl_yielded tells if task gave up the CPU before consuming * all its available runtime during the last job. * * @dl_non_contending tells if the task is inactive while still * contributing to the active utilization. In other words, it * indicates if the inactive timer has been armed and its handler * has not been executed yet. This flag is useful to avoid race * conditions between the inactive timer handler and the wakeup * code. * * @dl_overrun tells if the task asked to be informed about runtime * overruns. */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; /* * Bandwidth enforcement timer. Each -deadline task has its * own bandwidth to be enforced, thus we need one timer per task. */ struct hrtimer dl_timer; /* * Inactive timer, responsible for decreasing the active utilization * at the "0-lag time". When a -deadline task blocks, it contributes * to GRUB's active utilization until the "0-lag time", hence a * timer is needed to decrease the active utilization at the correct * time. */ struct hrtimer inactive_timer; #ifdef CONFIG_RT_MUTEXES /* * Priority Inheritance. When a DEADLINE scheduling entity is boosted * pi_se points to the donor, otherwise points to the dl_se it belongs * to (the original one/itself). */ struct sched_dl_entity *pi_se; #endif }; #ifdef CONFIG_UCLAMP_TASK /* Number of utilization clamp buckets (shorter alias) */ #define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT /* * Utilization clamp for a scheduling entity * @value: clamp value "assigned" to a se * @bucket_id: bucket index corresponding to the "assigned" value * @active: the se is currently refcounted in a rq's bucket * @user_defined: the requested clamp value comes from user-space * * The bucket_id is the index of the clamp bucket matching the clamp value * which is pre-computed and stored to avoid expensive integer divisions from * the fast path. * * The active bit is set whenever a task has got an "effective" value assigned, * which can be different from the clamp value "requested" from user-space. * This allows to know a task is refcounted in the rq's bucket corresponding * to the "effective" bucket_id. * * The user_defined bit is set whenever a task has got a task-specific clamp * value requested from userspace, i.e. the system defaults apply to this task * just as a restriction. This allows to relax default clamps when a less * restrictive task-specific value has been requested, thus allowing to * implement a "nice" semantic. For example, a task running with a 20% * default boost can still drop its own boosting to 0%. */ struct uclamp_se { unsigned int value : bits_per(SCHED_CAPACITY_SCALE); unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); unsigned int active : 1; unsigned int user_defined : 1; }; #endif /* CONFIG_UCLAMP_TASK */ union rcu_special { struct { u8 blocked; u8 need_qs; u8 exp_hint; /* Hint for performance. */ u8 need_mb; /* Readers need smp_mb(). */ } b; /* Bits. */ u32 s; /* Set of bits. */ }; enum perf_event_task_context { perf_invalid_context = -1, perf_hw_context = 0, perf_sw_context, perf_nr_task_contexts, }; struct wake_q_node { struct wake_q_node *next; }; struct kmap_ctrl { #ifdef CONFIG_KMAP_LOCAL int idx; pte_t pteval[KM_MAX_IDX]; #endif }; struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* * For reasons of header soup (see current_thread_info()), this * must be the first element of task_struct. */ struct thread_info thread_info; #endif unsigned int __state; /* saved state for "spinlock sleepers" */ unsigned int saved_state; /* * This begins the randomizable portion of task_struct. Only * scheduling-critical items should be added above here. */ randomized_struct_fields_start void *stack; refcount_t usage; /* Per task flags (PF_*), defined further below: */ unsigned int flags; unsigned int ptrace; #ifdef CONFIG_SMP int on_cpu; struct __call_single_node wake_entry; unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; /* * recent_used_cpu is initially set as the last CPU used by a task * that wakes affine another task. Waker/wakee relationships can * push tasks around a CPU where each wakeup moves to the next one. * Tracking a recently used CPU allows a quick search for a recently * used CPU that may be idle. */ int recent_used_cpu; int wake_cpu; #endif int on_rq; int prio; int static_prio; int normal_prio; unsigned int rt_priority; struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE struct rb_node core_node; unsigned long core_cookie; unsigned int core_occupation; #endif #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif #ifdef CONFIG_UCLAMP_TASK /* * Clamp values requested for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp_req[UCLAMP_CNT]; /* * Effective clamp values used for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif struct sched_statistics stats; #ifdef CONFIG_PREEMPT_NOTIFIERS /* List of struct preempt_notifier: */ struct hlist_head preempt_notifiers; #endif #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif unsigned int policy; int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t *user_cpus_ptr; cpumask_t cpus_mask; void *migration_pending; #ifdef CONFIG_SMP unsigned short migration_disabled; #endif unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; union rcu_special rcu_read_unlock_special; struct list_head rcu_node_entry; struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU unsigned long rcu_tasks_nvcsw; u8 rcu_tasks_holdout; u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; struct list_head rcu_tasks_holdout_list; #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU int trc_reader_nesting; int trc_ipi_to_cpu; union rcu_special trc_reader_special; struct list_head trc_holdout_list; struct list_head trc_blkd_node; int trc_blkd_cpu; #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ struct sched_info sched_info; struct list_head tasks; #ifdef CONFIG_SMP struct plist_node pushable_tasks; struct rb_node pushable_dl_tasks; #endif struct mm_struct *mm; struct mm_struct *active_mm; struct address_space *faults_disabled_mapping; int exit_state; int exit_code; int exit_signal; /* The signal sent when the parent dies: */ int pdeath_signal; /* JOBCTL_*, siglock protected: */ unsigned long jobctl; /* Used for emulating ABI behavior of previous Linux versions: */ unsigned int personality; /* Scheduler bits, serialized by scheduler locks: */ unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; /* Force alignment to the next boundary: */ unsigned :0; /* Unserialized, strictly 'current' */ /* * This field must not be in the scheduler word above due to wakelist * queueing no longer being serialized by p->on_cpu. However: * * p->XXX = X; ttwu() * schedule() if (p->on_rq && ..) // false * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true * deactivate_task() ttwu_queue_wakelist()) * p->on_rq = 0; p->sched_remote_wakeup = Y; * * guarantees all stores of 'current' are visible before * ->sched_remote_wakeup gets used, so it can be in this word. */ unsigned sched_remote_wakeup:1; #ifdef CONFIG_RT_MUTEXES unsigned sched_rt_mutex:1; #endif /* Bit to tell LSMs we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; #ifndef TIF_RESTORE_SIGMASK unsigned restore_sigmask:1; #endif #ifdef CONFIG_MEMCG unsigned in_user_fault:1; #endif #ifdef CONFIG_LRU_GEN /* whether the LRU algorithm may apply to this access */ unsigned in_lru_fault:1; #endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif #ifdef CONFIG_CGROUPS /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; /* task is frozen/stopped (used by the cgroup freezer) */ unsigned frozen:1; #endif #ifdef CONFIG_BLK_CGROUP unsigned use_memdelay:1; #endif #ifdef CONFIG_PSI /* Stalled due to lack of memory */ unsigned in_memstall:1; #endif #ifdef CONFIG_PAGE_OWNER /* Used by page_owner=on to detect recursion in page tracking. */ unsigned in_page_owner:1; #endif #ifdef CONFIG_EVENTFD /* Recursion prevention for eventfd_signal() */ unsigned in_eventfd:1; #endif #ifdef CONFIG_IOMMU_SVA unsigned pasid_activated:1; #endif #ifdef CONFIG_CPU_SUP_INTEL unsigned reported_split_lock:1; #endif #ifdef CONFIG_TASK_DELAY_ACCT /* delay due to memory thrashing */ unsigned in_thrashing:1; #endif unsigned long atomic_flags; /* Flags requiring atomic access. */ struct restart_block restart_block; pid_t pid; pid_t tgid; #ifdef CONFIG_STACKPROTECTOR /* Canary value for the -fstack-protector GCC feature: */ unsigned long stack_canary; #endif /* * Pointers to the (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with * p->real_parent->pid) */ /* Real parent process: */ struct task_struct __rcu *real_parent; /* Recipient of SIGCHLD, wait4() reports: */ struct task_struct __rcu *parent; /* * Children/sibling form the list of natural children: */ struct list_head children; struct list_head sibling; struct task_struct *group_leader; /* * 'ptraced' is the list of tasks this task is using ptrace() on. * * This includes both natural children and PTRACE_ATTACH targets. * 'ptrace_entry' is this task's link on the p->parent->ptraced list. */ struct list_head ptraced; struct list_head ptrace_entry; /* PID/PID hash table linkage. */ struct pid *thread_pid; struct hlist_node pid_links[PIDTYPE_MAX]; struct list_head thread_node; struct completion *vfork_done; /* CLONE_CHILD_SETTID: */ int __user *set_child_tid; /* CLONE_CHILD_CLEARTID: */ int __user *clear_child_tid; /* PF_KTHREAD | PF_IO_WORKER */ void *worker_private; u64 utime; u64 stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME u64 utimescaled; u64 stimescaled; #endif u64 gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN struct vtime vtime; #endif #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif /* Context switch counts: */ unsigned long nvcsw; unsigned long nivcsw; /* Monotonic time in nsecs: */ u64 start_time; /* Boot based time in nsecs: */ u64 start_boottime; /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ unsigned long min_flt; unsigned long maj_flt; /* Empty if CONFIG_POSIX_CPUTIMERS=n */ struct posix_cputimers posix_cputimers; #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK struct posix_cputimers_work posix_cputimers_work; #endif /* Process credentials: */ /* Tracer's credentials at attach: */ const struct cred __rcu *ptracer_cred; /* Objective and real subjective task credentials (COW): */ const struct cred __rcu *real_cred; /* Effective (overridable) subjective task credentials (COW): */ const struct cred __rcu *cred; #ifdef CONFIG_KEYS /* Cached requested key. */ struct key *cached_requested_key; #endif /* * executable name, excluding path. * * - normally initialized setup_new_exec() * - access it with [gs]et_task_comm() * - lock it with task_lock() */ char comm[TASK_COMM_LEN]; struct nameidata *nameidata; #ifdef CONFIG_SYSVIPC struct sysv_sem sysvsem; struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; unsigned long last_switch_time; #endif /* Filesystem information: */ struct fs_struct *fs; /* Open file information: */ struct files_struct *files; #ifdef CONFIG_IO_URING struct io_uring_task *io_uring; #endif /* Namespaces: */ struct nsproxy *nsproxy; /* Signal handlers: */ struct signal_struct *signal; struct sighand_struct __rcu *sighand; sigset_t blocked; sigset_t real_blocked; /* Restored if set_restore_sigmask() was used: */ sigset_t saved_sigmask; struct sigpending pending; unsigned long sas_ss_sp; size_t sas_ss_size; unsigned int sas_ss_flags; struct callback_head *task_works; #ifdef CONFIG_AUDIT #ifdef CONFIG_AUDITSYSCALL struct audit_context *audit_context; #endif kuid_t loginuid; unsigned int sessionid; #endif struct seccomp seccomp; struct syscall_user_dispatch syscall_dispatch; /* Thread group tracking: */ u64 parent_exec_id; u64 self_exec_id; /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */ spinlock_t alloc_lock; /* Protection of the PI data structures: */ raw_spinlock_t pi_lock; struct wake_q_node wake_q; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task: */ struct rb_root_cached pi_waiters; /* Updated under owner's pi_lock and rq lock */ struct task_struct *pi_top_task; /* Deadlock detection and priority inheritance handling: */ struct rt_mutex_waiter *pi_blocked_on; #endif #ifdef CONFIG_DEBUG_MUTEXES /* Mutex deadlock detection: */ struct mutex_waiter *blocked_on; #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP int non_block_count; #endif #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events irqtrace; unsigned int hardirq_threaded; u64 hardirq_chain_key; int softirqs_enabled; int softirq_context; int irq_config; #endif #ifdef CONFIG_PREEMPT_RT int softirq_disable_cnt; #endif #ifdef CONFIG_LOCKDEP # define MAX_LOCK_DEPTH 48UL u64 curr_chain_key; int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif #if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP) unsigned int in_ubsan; #endif /* Journalling filesystem info: */ void *journal_info; /* Stacked block device info: */ struct bio_list *bio_list; /* Stack plugging: */ struct blk_plug *plug; /* VM state: */ struct reclaim_state *reclaim_state; struct io_context *io_context; #ifdef CONFIG_COMPACTION struct capture_control *capture_control; #endif /* Ptrace state: */ unsigned long ptrace_message; kernel_siginfo_t *last_siginfo; struct task_io_accounting ioac; #ifdef CONFIG_PSI /* Pressure stall state */ unsigned int psi_flags; #endif #ifdef CONFIG_TASK_XACCT /* Accumulated RSS usage: */ u64 acct_rss_mem1; /* Accumulated virtual memory usage: */ u64 acct_vm_mem1; /* stime + utime since last update: */ u64 acct_timexpd; #endif #ifdef CONFIG_CPUSETS /* Protected by ->alloc_lock: */ nodemask_t mems_allowed; /* Sequence number to catch updates: */ seqcount_spinlock_t mems_allowed_seq; int cpuset_mem_spread_rotor; int cpuset_slab_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock: */ struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; #endif #ifdef CONFIG_X86_CPU_RESCTRL u32 closid; u32 rmid; #endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; #ifdef CONFIG_COMPAT struct compat_robust_list_head __user *compat_robust_list; #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; struct mutex futex_exit_mutex; unsigned int futex_state; #endif #ifdef CONFIG_PERF_EVENTS struct perf_event_context *perf_event_ctxp; struct mutex perf_event_mutex; struct list_head perf_event_list; #endif #ifdef CONFIG_DEBUG_PREEMPT unsigned long preempt_disable_ip; #endif #ifdef CONFIG_NUMA /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING int numa_scan_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; int numa_preferred_nid; unsigned long numa_migrate_retry; /* Migration stamp: */ u64 node_stamp; u64 last_task_numa_placement; u64 last_sum_exec_runtime; struct callback_head numa_work; /* * This pointer is only modified for current in syscall and * pagefault context (and for tasks being destroyed), so it can be read * from any of the following contexts: * - RCU read-side critical section * - current->numa_group from everywhere * - task's runqueue locked, task not running */ struct numa_group __rcu *numa_group; /* * numa_faults is an array split into four regions: * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer * in this precise order. * * faults_memory: Exponential decaying average of faults on a per-node * basis. Scheduling placement decisions are made based on these * counts. The values remain static for the duration of a PTE scan. * faults_cpu: Track the nodes the process was running on when a NUMA * hinting fault was incurred. * faults_memory_buffer and faults_cpu_buffer: Record faults per node * during the current scan window. When the scan completes, the counts * in faults_memory and faults_cpu decay and these values are copied. */ unsigned long *numa_faults; unsigned long total_numa_faults; /* * numa_faults_locality tracks if faults recorded during the last * scan window were remote/local or failed to migrate. The task scan * period is adapted based on the locality of the faults with different * weights depending on whether they were shared or private faults */ unsigned long numa_faults_locality[3]; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_RSEQ struct rseq __user *rseq; u32 rseq_len; u32 rseq_sig; /* * RmW on rseq_event_mask must be performed atomically * with respect to preemption. */ unsigned long rseq_event_mask; #endif #ifdef CONFIG_SCHED_MM_CID int mm_cid; /* Current cid in mm */ int last_mm_cid; /* Most recent cid in mm */ int migrate_from_cpu; int mm_cid_active; /* Whether cid bitmap is active */ struct callback_head cid_work; #endif struct tlbflush_unmap_batch tlb_ubc; /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; struct page_frag task_frag; #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; #endif #ifdef CONFIG_FAULT_INJECTION int make_it_fail; unsigned int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call * balance_dirty_pages() for a dirty throttling pause: */ int nr_dirtied; int nr_dirtied_pause; /* Start of a write-and-pause period: */ unsigned long dirty_paused_when; #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; #endif /* * Time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. */ u64 timer_slack_ns; u64 default_timer_slack_ns; #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) unsigned int kasan_depth; #endif #ifdef CONFIG_KCSAN struct kcsan_ctx kcsan_ctx; #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events kcsan_save_irqtrace; #endif #ifdef CONFIG_KCSAN_WEAK_MEMORY int kcsan_stack_depth; #endif #endif #ifdef CONFIG_KMSAN struct kmsan_ctx kmsan_ctx; #endif #if IS_ENABLED(CONFIG_KUNIT) struct kunit *kunit_test; #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ int curr_ret_stack; int curr_ret_depth; /* Stack of return addresses for return function tracing: */ struct ftrace_ret_stack *ret_stack; /* Timestamp for last schedule: */ unsigned long long ftrace_timestamp; /* * Number of functions that haven't been traced * because of depth overrun: */ atomic_t trace_overrun; /* Pause tracing: */ atomic_t tracing_graph_pause; #endif #ifdef CONFIG_TRACING /* Bitmask and counter of trace recursion: */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_KCOV /* See kernel/kcov.c for more details. */ /* Coverage collection mode enabled for this task (0 if disabled): */ unsigned int kcov_mode; /* Size of the kcov_area: */ unsigned int kcov_size; /* Buffer for coverage collection: */ void *kcov_area; /* KCOV descriptor wired with this task or NULL: */ struct kcov *kcov; /* KCOV common handle for remote coverage collection: */ u64 kcov_handle; /* KCOV sequence number: */ int kcov_sequence; /* Collect coverage from softirq context: */ unsigned int kcov_softirq; #endif #ifdef CONFIG_MEMCG struct mem_cgroup *memcg_in_oom; gfp_t memcg_oom_gfp_mask; int memcg_oom_order; /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; #endif #ifdef CONFIG_MEMCG_KMEM struct obj_cgroup *objcg; #endif #ifdef CONFIG_BLK_CGROUP struct gendisk *throttle_disk; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) unsigned int sequential_io; unsigned int sequential_io_avg; #endif struct kmap_ctrl kmap_ctrl; #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; # ifdef CONFIG_PREEMPT_RT unsigned long saved_state_change; # endif #endif struct rcu_head rcu; refcount_t rcu_users; int pagefault_disabled; #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; struct timer_list oom_reaper_timer; #endif #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK /* A live task holds one reference: */ refcount_t stack_refcount; #endif #ifdef CONFIG_LIVEPATCH int patch_state; #endif #ifdef CONFIG_SECURITY /* Used by LSM modules for access restriction: */ void *security; #endif #ifdef CONFIG_BPF_SYSCALL /* Used by BPF task local storage */ struct bpf_local_storage __rcu *bpf_storage; /* Used for BPF run context */ struct bpf_run_ctx *bpf_ctx; #endif #ifdef CONFIG_GCC_PLUGIN_STACKLEAK unsigned long lowest_stack; unsigned long prev_lowest_stack; #endif #ifdef CONFIG_X86_MCE void __user *mce_vaddr; __u64 mce_kflags; u64 mce_addr; __u64 mce_ripv : 1, mce_whole_page : 1, __mce_reserved : 62; struct callback_head mce_kill_me; int mce_count; #endif #ifdef CONFIG_KRETPROBES struct llist_head kretprobe_instances; #endif #ifdef CONFIG_RETHOOK struct llist_head rethooks; #endif #ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH /* * If L1D flush is supported on mm context switch * then we use this callback head to queue kill work * to kill tasks that are not running on SMT disabled * cores */ struct callback_head l1d_flush_kill; #endif #ifdef CONFIG_RV /* * Per-task RV monitor. Nowadays fixed in RV_PER_TASK_MONITORS. * If we find justification for more monitors, we can think * about adding more or developing a dynamic method. So far, * none of these are justified. */ union rv_task_monitor rv[RV_PER_TASK_MONITORS]; #endif #ifdef CONFIG_USER_EVENTS struct user_event_mm *user_event_mm; #endif /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. */ randomized_struct_fields_end /* CPU-specific state of this task: */ struct thread_struct thread; /* * WARNING: on x86, 'thread_struct' contains a variable-sized * structure. It *MUST* be at the end of 'task_struct'. * * Do not put anything below here! */ }; static inline struct pid *task_pid(struct task_struct *task) { return task->thread_pid; } /* * the helpers to get the task's different pids as they are seen * from various namespaces * * task_xid_nr() : global id, i.e. the id seen from the init namespace; * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of * current. * task_xid_nr_ns() : id seen from the ns specified; * * see also pid_nr() etc in include/linux/pid.h */ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns); static inline pid_t task_pid_nr(struct task_struct *tsk) { return tsk->pid; } static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); } static inline pid_t task_pid_vnr(struct task_struct *tsk) { return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); } static inline pid_t task_tgid_nr(struct task_struct *tsk) { return tsk->tgid; } /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. * * Test if a process is not yet dead (at most zombie state) * If pid_alive fails, then pointers within the task structure * can be stale and must not be dereferenced. * * Return: 1 if the process is alive. 0 otherwise. */ static inline int pid_alive(const struct task_struct *p) { return p->thread_pid != NULL; } static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); } static inline pid_t task_pgrp_vnr(struct task_struct *tsk) { return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); } static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); } static inline pid_t task_session_vnr(struct task_struct *tsk) { return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns); } static inline pid_t task_tgid_vnr(struct task_struct *tsk) { return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL); } static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) { pid_t pid = 0; rcu_read_lock(); if (pid_alive(tsk)) pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); rcu_read_unlock(); return pid; } static inline pid_t task_ppid_nr(const struct task_struct *tsk) { return task_ppid_nr_ns(tsk, &init_pid_ns); } /* Obsolete, do not use: */ static inline pid_t task_pgrp_nr(struct task_struct *tsk) { return task_pgrp_nr_ns(tsk, &init_pid_ns); } #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) static inline unsigned int __task_state_index(unsigned int tsk_state, unsigned int tsk_exit_state) { unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT; BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); if ((tsk_state & TASK_IDLE) == TASK_IDLE) state = TASK_REPORT_IDLE; /* * We're lying here, but rather than expose a completely new task state * to userspace, we can make this appear as if the task has gone through * a regular rt_mutex_lock() call. */ if (tsk_state & TASK_RTLOCK_WAIT) state = TASK_UNINTERRUPTIBLE; return fls(state); } static inline unsigned int task_state_index(struct task_struct *tsk) { return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state); } static inline char task_index_to_char(unsigned int state) { static const char state_char[] = "RSDTtXZPI"; BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); return state_char[state]; } static inline char task_state_to_char(struct task_struct *tsk) { return task_index_to_char(task_state_index(tsk)); } /** * is_global_init - check if a task structure is init. Since init * is free to have sub-threads we need to check tgid. * @tsk: Task structure to be checked. * * Check if a task structure is the first user space task the kernel created. * * Return: 1 if the task structure is init. 0 otherwise. */ static inline int is_global_init(struct task_struct *tsk) { return task_tgid_nr(tsk) == 1; } extern struct pid *cad_pid; /* * Per process flags */ #define PF_VCPU 0x00000001 /* I'm a virtual CPU */ #define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* Getting shut down */ #define PF_POSTCOREDUMP 0x00000008 /* Coredumps should ignore this task */ #define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* Dumped core */ #define PF_SIGNALED 0x00000400 /* Killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF__HOLE__00010000 0x00010000 #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ #define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF__HOLE__00800000 0x00800000 #define PF__HOLE__01000000 0x01000000 #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ #define PF__HOLE__20000000 0x20000000 #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example * with tsk_used_math (like during threaded core dumping). * There is however an exception to this rule during ptrace * or during fork: the ptracer task is allowed to write to the * child->flags of its traced child (same goes for fork, the parent * can write to the child->flags), because we're guaranteed the * child is not running and in turn not changing child->flags * at the same time the parent does it. */ #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) #define clear_used_math() clear_stopped_child_used_math(current) #define set_used_math() set_stopped_child_used_math(current) #define conditional_stopped_child_used_math(condition, child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) #define conditional_used_math(condition) conditional_stopped_child_used_math(condition, current) #define copy_to_stopped_child_used_math(child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) static __always_inline bool is_percpu_thread(void) { #ifdef CONFIG_SMP return (current->flags & PF_NO_SETAFFINITY) && (current->nr_cpus_allowed == 1); #else return true; #endif } /* Per-process atomic flags. */ #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ #define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ #define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */ #define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ #define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ #define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ #define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */ #define TASK_PFA_TEST(name, func) \ static inline bool task_##func(struct task_struct *p) \ { return test_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_SET(name, func) \ static inline void task_set_##func(struct task_struct *p) \ { set_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_CLEAR(name, func) \ static inline void task_clear_##func(struct task_struct *p) \ { clear_bit(PFA_##name, &p->atomic_flags); } TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs) TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs) TASK_PFA_TEST(SPREAD_PAGE, spread_page) TASK_PFA_SET(SPREAD_PAGE, spread_page) TASK_PFA_CLEAR(SPREAD_PAGE, spread_page) TASK_PFA_TEST(SPREAD_SLAB, spread_slab) TASK_PFA_SET(SPREAD_SLAB, spread_slab) TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) static inline void current_restore_flags(unsigned long orig_flags, unsigned long flags) { current->flags &= ~flags; current->flags |= orig_flags & flags; } extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern int task_can_attach(struct task_struct *p); extern int dl_bw_alloc(int cpu, u64 dl_bw); extern void dl_bw_free(int cpu, u64 dl_bw); #ifdef CONFIG_SMP /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */ extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); /** * set_cpus_allowed_ptr - set CPU affinity mask of a task * @p: the task * @new_mask: CPU affinity mask * * Return: zero if successful, or a negative error code */ extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); extern void release_user_cpus_ptr(struct task_struct *p); extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); #else static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { } static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { if (!cpumask_test_cpu(0, new_mask)) return -EINVAL; return 0; } static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node) { if (src->user_cpus_ptr) return -EINVAL; return 0; } static inline void release_user_cpus_ptr(struct task_struct *p) { WARN_ON(p->user_cpus_ptr); } static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) { return 0; } #endif extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); /** * task_nice - return the nice value of a given task. * @p: the task in question. * * Return: The nice value [ -20 ... 0 ... 19 ]. */ static inline int task_nice(const struct task_struct *p) { return PRIO_TO_NICE((p)->static_prio); } extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); extern int available_idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern void sched_set_fifo(struct task_struct *p); extern void sched_set_fifo_low(struct task_struct *p); extern void sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); extern struct task_struct *idle_task(int cpu); /** * is_idle_task - is the specified task an idle task? * @p: the task in question. * * Return: 1 if @p is an idle task. 0 otherwise. */ static __always_inline bool is_idle_task(const struct task_struct *p) { return !!(p->flags & PF_IDLE); } extern struct task_struct *curr_task(int cpu); extern void ia64_set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { #ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK struct task_struct task; #endif #ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; #endif unsigned long stack[THREAD_SIZE/sizeof(long)]; }; #ifndef CONFIG_THREAD_INFO_IN_TASK extern struct thread_info init_thread_info; #endif extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)]; #ifdef CONFIG_THREAD_INFO_IN_TASK # define task_thread_info(task) (&(task)->thread_info) #elif !defined(__HAVE_THREAD_FUNCTIONS) # define task_thread_info(task) ((struct thread_info *)(task)->stack) #endif /* * find a task by one of its numerical ids * * find_task_by_pid_ns(): * finds a task by its pid in the specified namespace * find_task_by_vpid(): * finds a task by its virtual pid * * see also find_vpid() etc in include/linux/pid.h */ extern struct task_struct *find_task_by_vpid(pid_t nr); extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); /* * find a task by its virtual pid and get the task struct */ extern struct task_struct *find_get_task_by_vpid(pid_t nr); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); #ifdef CONFIG_SMP extern void kick_process(struct task_struct *tsk); #else static inline void kick_process(struct task_struct *tsk) { } #endif extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); static inline void set_task_comm(struct task_struct *tsk, const char *from) { __set_task_comm(tsk, from, false); } extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); #define get_task_comm(buf, tsk) ({ \ BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \ __get_task_comm(buf, sizeof(buf), tsk); \ }) #ifdef CONFIG_SMP static __always_inline void scheduler_ipi(void) { /* * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting * TIF_NEED_RESCHED remotely (for the first time) will also send * this IPI. */ preempt_fold_need_resched(); } #else static inline void scheduler_ipi(void) { } #endif extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); /* * Set thread flags in other task's structures. * See asm/thread_info.h for TIF_xxxx flags available: */ static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) { set_ti_thread_flag(task_thread_info(tsk), flag); } static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) { clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag, bool value) { update_ti_thread_flag(task_thread_info(tsk), flag, value); } static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_ti_thread_flag(task_thread_info(tsk), flag); } static inline void set_tsk_need_resched(struct task_struct *tsk) { set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } static inline void clear_tsk_need_resched(struct task_struct *tsk) { clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } static inline int test_tsk_need_resched(struct task_struct *tsk) { return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return * value indicates whether a reschedule was done in fact. * cond_resched_lock() will drop the spinlock before scheduling, */ #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) extern int __cond_resched(void); #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) void sched_dynamic_klp_enable(void); void sched_dynamic_klp_disable(void); DECLARE_STATIC_CALL(cond_resched, __cond_resched); static __always_inline int _cond_resched(void) { return static_call_mod(cond_resched)(); } #elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) extern int dynamic_cond_resched(void); static __always_inline int _cond_resched(void) { return dynamic_cond_resched(); } #else /* !CONFIG_PREEMPTION */ static inline int _cond_resched(void) { klp_sched_try_switch(); return __cond_resched(); } #endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ #else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */ static inline int _cond_resched(void) { klp_sched_try_switch(); return 0; } #endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */ #define cond_resched() ({ \ __might_resched(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) extern int __cond_resched_lock(spinlock_t *lock); extern int __cond_resched_rwlock_read(rwlock_t *lock); extern int __cond_resched_rwlock_write(rwlock_t *lock); #define MIGHT_RESCHED_RCU_SHIFT 8 #define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1) #ifndef CONFIG_PREEMPT_RT /* * Non RT kernels have an elevated preempt count due to the held lock, * but are not allowed to be inside a RCU read side critical section */ # define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET #else /* * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in * cond_resched*lock() has to take that into account because it checks for * preempt_count() and rcu_preempt_depth(). */ # define PREEMPT_LOCK_RESCHED_OFFSETS \ (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT)) #endif #define cond_resched_lock(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_lock(lock); \ }) #define cond_resched_rwlock_read(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_rwlock_read(lock); \ }) #define cond_resched_rwlock_write(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_rwlock_write(lock); \ }) static inline void cond_resched_rcu(void) { #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) rcu_read_unlock(); cond_resched(); rcu_read_lock(); #endif } #ifdef CONFIG_PREEMPT_DYNAMIC extern bool preempt_model_none(void); extern bool preempt_model_voluntary(void); extern bool preempt_model_full(void); #else static inline bool preempt_model_none(void) { return IS_ENABLED(CONFIG_PREEMPT_NONE); } static inline bool preempt_model_voluntary(void) { return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY); } static inline bool preempt_model_full(void) { return IS_ENABLED(CONFIG_PREEMPT); } #endif static inline bool preempt_model_rt(void) { return IS_ENABLED(CONFIG_PREEMPT_RT); } /* * Does the preemption model allow non-cooperative preemption? * * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the * PREEMPT_NONE model. */ static inline bool preempt_model_preemptible(void) { return preempt_model_full() || preempt_model_rt(); } /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPTION, * but a general need for low latency) */ static inline int spin_needbreak(spinlock_t *lock) { #ifdef CONFIG_PREEMPTION return spin_is_contended(lock); #else return 0; #endif } /* * Check if a rwlock is contended. * Returns non-zero if there is another task waiting on the rwlock. * Returns zero if the lock is not contended or the system / underlying * rwlock implementation does not support contention detection. * Technically does not depend on CONFIG_PREEMPTION, but a general need * for low latency. */ static inline int rwlock_needbreak(rwlock_t *lock) { #ifdef CONFIG_PREEMPTION return rwlock_is_contended(lock); #else return 0; #endif } static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); } /* * Wrappers for p->thread_info->cpu access. No-op on UP. */ #ifdef CONFIG_SMP static inline unsigned int task_cpu(const struct task_struct *p) { return READ_ONCE(task_thread_info(p)->cpu); } extern void set_task_cpu(struct task_struct *p, unsigned int cpu); #else static inline unsigned int task_cpu(const struct task_struct *p) { return 0; } static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) { } #endif /* CONFIG_SMP */ extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_curr_snapshot(int cpu); /* * In order to reduce various lock holder preemption latencies provide an * interface to see if a vCPU is currently running or not. * * This allows us to terminate optimistic spin loops and block, analogous to * the native optimistic spin heuristic of testing if the lock owner task is * running or not. */ #ifndef vcpu_is_preempted static inline bool vcpu_is_preempted(int cpu) { return false; } #endif extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #ifndef TASK_SIZE_OF #define TASK_SIZE_OF(tsk) TASK_SIZE #endif #ifdef CONFIG_SMP static inline bool owner_on_cpu(struct task_struct *owner) { /* * As lock holder preemption issue, we both skip spinning if * task is not on cpu or its cpu is preempted */ return READ_ONCE(owner->on_cpu) && !vcpu_is_preempted(task_cpu(owner)); } /* Returns effective CPU energy utilization, as seen by the scheduler */ unsigned long sched_cpu_util(int cpu); #endif /* CONFIG_SMP */ #ifdef CONFIG_RSEQ /* * Map the event mask on the user-space ABI enum rseq_cs_flags * for direct mask checks. */ enum rseq_event_mask_bits { RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, }; enum rseq_event_mask { RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), }; static inline void rseq_set_notify_resume(struct task_struct *t) { if (t->rseq) set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); } void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { if (current->rseq) __rseq_handle_notify_resume(ksig, regs); } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { preempt_disable(); __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask); preempt_enable(); rseq_handle_notify_resume(ksig, regs); } /* rseq_preempt() requires preemption to be disabled. */ static inline void rseq_preempt(struct task_struct *t) { __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); rseq_set_notify_resume(t); } /* rseq_migrate() requires preemption to be disabled. */ static inline void rseq_migrate(struct task_struct *t) { __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); rseq_set_notify_resume(t); } /* * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. */ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { if (clone_flags & CLONE_VM) { t->rseq = NULL; t->rseq_len = 0; t->rseq_sig = 0; t->rseq_event_mask = 0; } else { t->rseq = current->rseq; t->rseq_len = current->rseq_len; t->rseq_sig = current->rseq_sig; t->rseq_event_mask = current->rseq_event_mask; } } static inline void rseq_execve(struct task_struct *t) { t->rseq = NULL; t->rseq_len = 0; t->rseq_sig = 0; t->rseq_event_mask = 0; } #else static inline void rseq_set_notify_resume(struct task_struct *t) { } static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_preempt(struct task_struct *t) { } static inline void rseq_migrate(struct task_struct *t) { } static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { } static inline void rseq_execve(struct task_struct *t) { } #endif #ifdef CONFIG_DEBUG_RSEQ void rseq_syscall(struct pt_regs *regs); #else static inline void rseq_syscall(struct pt_regs *regs) { } #endif #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); extern void sched_core_fork(struct task_struct *p); extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, unsigned long uaddr); extern int sched_core_idle_cpu(int cpu); #else static inline void sched_core_free(struct task_struct *tsk) { } static inline void sched_core_fork(struct task_struct *p) { } static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } #endif extern void sched_set_stop_task(int cpu, struct task_struct *stop); #endif
2 1 2 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 /* * linux/fs/nls/nls_iso8859-4.c * * Charset iso8859-4 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* 0x90*/ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* 0xa0*/ 0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7, 0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af, /* 0xb0*/ 0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7, 0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b, /* 0xc0*/ 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a, /* 0xd0*/ 0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df, /* 0xe0*/ 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b, /* 0xf0*/ 0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0xaf, /* 0xa8-0xaf */ 0xb0, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0x00, /* 0xc0-0xc7 */ 0x00, 0xc9, 0x00, 0xcb, 0x00, 0xcd, 0xce, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0x00, 0xda, 0xdb, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ 0x00, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0x00, /* 0xe0-0xe7 */ 0x00, 0xe9, 0x00, 0xeb, 0x00, 0xed, 0xee, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0x00, 0xfa, 0xfb, 0xfc, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0xc0, 0xe0, 0x00, 0x00, 0xa1, 0xb1, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0x00, 0x00, /* 0x08-0x0f */ 0xd0, 0xf0, 0xaa, 0xba, 0x00, 0x00, 0xcc, 0xec, /* 0x10-0x17 */ 0xca, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0xab, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0xa5, 0xb5, 0xcf, 0xef, 0x00, 0x00, 0xc7, 0xe7, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd3, 0xf3, /* 0x30-0x37 */ 0xa2, 0x00, 0x00, 0xa6, 0xb6, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0xd1, 0xf1, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0xbd, 0xbf, 0xd2, 0xf2, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa3, 0xb3, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0xa9, 0xb9, 0x00, 0x00, 0x00, 0x00, 0xac, 0xbc, /* 0x60-0x67 */ 0xdd, 0xfd, 0xde, 0xfe, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0xd9, 0xf9, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0xae, 0xbe, 0x00, /* 0x78-0x7f */ }; static const unsigned char page02[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb7, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0xff, 0x00, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, page02, NULL, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xb1, 0xa2, 0xb3, 0xa4, 0xb5, 0xb6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbf, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xa1, 0xb2, 0xa3, 0xb4, 0xa5, 0xa6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xbd, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "iso8859-4", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_iso8859_4(void) { return register_nls(&table); } static void __exit exit_nls_iso8859_4(void) { unregister_nls(&table); } module_init(init_nls_iso8859_4) module_exit(exit_nls_iso8859_4) MODULE_LICENSE("Dual BSD/GPL");
100 12 590 590 583 585 588 556 19 11 19 15 3 590 557 3 53 557 1 48 571 567 15 9 589 555 15 158 68 30 2 32 32 27 10 10 539 541 17 572 561 4 541 229 10 176 10 12 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 // SPDX-License-Identifier: GPL-2.0-or-later /* * Scatterlist Cryptographic API. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2002 David S. Miller (davem@redhat.com) * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> * * Portions derived from Cryptoapi, by Alexander Kjeldaas <astor@fast.no> * and Nettle, by Niels Möller. */ #include <linux/err.h> #include <linux/errno.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/module.h> #include <linux/param.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/completion.h> #include "internal.h" LIST_HEAD(crypto_alg_list); EXPORT_SYMBOL_GPL(crypto_alg_list); DECLARE_RWSEM(crypto_alg_sem); EXPORT_SYMBOL_GPL(crypto_alg_sem); BLOCKING_NOTIFIER_HEAD(crypto_chain); EXPORT_SYMBOL_GPL(crypto_chain); #ifndef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS DEFINE_STATIC_KEY_FALSE(__crypto_boot_test_finished); EXPORT_SYMBOL_GPL(__crypto_boot_test_finished); #endif static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg); struct crypto_alg *crypto_mod_get(struct crypto_alg *alg) { return try_module_get(alg->cra_module) ? crypto_alg_get(alg) : NULL; } EXPORT_SYMBOL_GPL(crypto_mod_get); void crypto_mod_put(struct crypto_alg *alg) { struct module *module = alg->cra_module; crypto_alg_put(alg); module_put(module); } EXPORT_SYMBOL_GPL(crypto_mod_put); static struct crypto_alg *__crypto_alg_lookup(const char *name, u32 type, u32 mask) { struct crypto_alg *q, *alg = NULL; int best = -2; list_for_each_entry(q, &crypto_alg_list, cra_list) { int exact, fuzzy; if (crypto_is_moribund(q)) continue; if ((q->cra_flags ^ type) & mask) continue; if (crypto_is_larval(q) && !crypto_is_test_larval((struct crypto_larval *)q) && ((struct crypto_larval *)q)->mask != mask) continue; exact = !strcmp(q->cra_driver_name, name); fuzzy = !strcmp(q->cra_name, name); if (!exact && !(fuzzy && q->cra_priority > best)) continue; if (unlikely(!crypto_mod_get(q))) continue; best = q->cra_priority; if (alg) crypto_mod_put(alg); alg = q; if (exact) break; } return alg; } static void crypto_larval_destroy(struct crypto_alg *alg) { struct crypto_larval *larval = (void *)alg; BUG_ON(!crypto_is_larval(alg)); if (!IS_ERR_OR_NULL(larval->adult)) crypto_mod_put(larval->adult); kfree(larval); } struct crypto_larval *crypto_larval_alloc(const char *name, u32 type, u32 mask) { struct crypto_larval *larval; larval = kzalloc(sizeof(*larval), GFP_KERNEL); if (!larval) return ERR_PTR(-ENOMEM); larval->mask = mask; larval->alg.cra_flags = CRYPTO_ALG_LARVAL | type; larval->alg.cra_priority = -1; larval->alg.cra_destroy = crypto_larval_destroy; strscpy(larval->alg.cra_name, name, CRYPTO_MAX_ALG_NAME); init_completion(&larval->completion); return larval; } EXPORT_SYMBOL_GPL(crypto_larval_alloc); static struct crypto_alg *crypto_larval_add(const char *name, u32 type, u32 mask) { struct crypto_alg *alg; struct crypto_larval *larval; larval = crypto_larval_alloc(name, type, mask); if (IS_ERR(larval)) return ERR_CAST(larval); refcount_set(&larval->alg.cra_refcnt, 2); down_write(&crypto_alg_sem); alg = __crypto_alg_lookup(name, type, mask); if (!alg) { alg = &larval->alg; list_add(&alg->cra_list, &crypto_alg_list); } up_write(&crypto_alg_sem); if (alg != &larval->alg) { kfree(larval); if (crypto_is_larval(alg)) alg = crypto_larval_wait(alg); } return alg; } void crypto_larval_kill(struct crypto_alg *alg) { struct crypto_larval *larval = (void *)alg; down_write(&crypto_alg_sem); list_del(&alg->cra_list); up_write(&crypto_alg_sem); complete_all(&larval->completion); crypto_alg_put(alg); } EXPORT_SYMBOL_GPL(crypto_larval_kill); void crypto_wait_for_test(struct crypto_larval *larval) { int err; err = crypto_probing_notify(CRYPTO_MSG_ALG_REGISTER, larval->adult); if (WARN_ON_ONCE(err != NOTIFY_STOP)) goto out; err = wait_for_completion_killable(&larval->completion); WARN_ON(err); out: crypto_larval_kill(&larval->alg); } EXPORT_SYMBOL_GPL(crypto_wait_for_test); static void crypto_start_test(struct crypto_larval *larval) { if (!crypto_is_test_larval(larval)) return; if (larval->test_started) return; down_write(&crypto_alg_sem); if (larval->test_started) { up_write(&crypto_alg_sem); return; } larval->test_started = true; up_write(&crypto_alg_sem); crypto_wait_for_test(larval); } static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg) { struct crypto_larval *larval = (void *)alg; long timeout; if (!crypto_boot_test_finished()) crypto_start_test(larval); timeout = wait_for_completion_killable_timeout( &larval->completion, 60 * HZ); alg = larval->adult; if (timeout < 0) alg = ERR_PTR(-EINTR); else if (!timeout) alg = ERR_PTR(-ETIMEDOUT); else if (!alg) alg = ERR_PTR(-ENOENT); else if (IS_ERR(alg)) ; else if (crypto_is_test_larval(larval) && !(alg->cra_flags & CRYPTO_ALG_TESTED)) alg = ERR_PTR(-EAGAIN); else if (alg->cra_flags & CRYPTO_ALG_FIPS_INTERNAL) alg = ERR_PTR(-EAGAIN); else if (!crypto_mod_get(alg)) alg = ERR_PTR(-EAGAIN); crypto_mod_put(&larval->alg); return alg; } static struct crypto_alg *crypto_alg_lookup(const char *name, u32 type, u32 mask) { const u32 fips = CRYPTO_ALG_FIPS_INTERNAL; struct crypto_alg *alg; u32 test = 0; if (!((type | mask) & CRYPTO_ALG_TESTED)) test |= CRYPTO_ALG_TESTED; down_read(&crypto_alg_sem); alg = __crypto_alg_lookup(name, (type | test) & ~fips, (mask | test) & ~fips); if (alg) { if (((type | mask) ^ fips) & fips) mask |= fips; mask &= fips; if (!crypto_is_larval(alg) && ((type ^ alg->cra_flags) & mask)) { /* Algorithm is disallowed in FIPS mode. */ crypto_mod_put(alg); alg = ERR_PTR(-ENOENT); } } else if (test) { alg = __crypto_alg_lookup(name, type, mask); if (alg && !crypto_is_larval(alg)) { /* Test failed */ crypto_mod_put(alg); alg = ERR_PTR(-ELIBBAD); } } up_read(&crypto_alg_sem); return alg; } static struct crypto_alg *crypto_larval_lookup(const char *name, u32 type, u32 mask) { struct crypto_alg *alg; if (!name) return ERR_PTR(-ENOENT); type &= ~(CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD); mask &= ~(CRYPTO_ALG_LARVAL | CRYPTO_ALG_DEAD); alg = crypto_alg_lookup(name, type, mask); if (!alg && !(mask & CRYPTO_NOLOAD)) { request_module("crypto-%s", name); if (!((type ^ CRYPTO_ALG_NEED_FALLBACK) & mask & CRYPTO_ALG_NEED_FALLBACK)) request_module("crypto-%s-all", name); alg = crypto_alg_lookup(name, type, mask); } if (!IS_ERR_OR_NULL(alg) && crypto_is_larval(alg)) alg = crypto_larval_wait(alg); else if (!alg) alg = crypto_larval_add(name, type, mask); return alg; } int crypto_probing_notify(unsigned long val, void *v) { int ok; ok = blocking_notifier_call_chain(&crypto_chain, val, v); if (ok == NOTIFY_DONE) { request_module("cryptomgr"); ok = blocking_notifier_call_chain(&crypto_chain, val, v); } return ok; } EXPORT_SYMBOL_GPL(crypto_probing_notify); struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask) { struct crypto_alg *alg; struct crypto_alg *larval; int ok; /* * If the internal flag is set for a cipher, require a caller to * invoke the cipher with the internal flag to use that cipher. * Also, if a caller wants to allocate a cipher that may or may * not be an internal cipher, use type | CRYPTO_ALG_INTERNAL and * !(mask & CRYPTO_ALG_INTERNAL). */ if (!((type | mask) & CRYPTO_ALG_INTERNAL)) mask |= CRYPTO_ALG_INTERNAL; larval = crypto_larval_lookup(name, type, mask); if (IS_ERR(larval) || !crypto_is_larval(larval)) return larval; ok = crypto_probing_notify(CRYPTO_MSG_ALG_REQUEST, larval); if (ok == NOTIFY_STOP) alg = crypto_larval_wait(larval); else { crypto_mod_put(larval); alg = ERR_PTR(-ENOENT); } crypto_larval_kill(larval); return alg; } EXPORT_SYMBOL_GPL(crypto_alg_mod_lookup); static void crypto_exit_ops(struct crypto_tfm *tfm) { const struct crypto_type *type = tfm->__crt_alg->cra_type; if (type && tfm->exit) tfm->exit(tfm); } static unsigned int crypto_ctxsize(struct crypto_alg *alg, u32 type, u32 mask) { const struct crypto_type *type_obj = alg->cra_type; unsigned int len; len = alg->cra_alignmask & ~(crypto_tfm_ctx_alignment() - 1); if (type_obj) return len + type_obj->ctxsize(alg, type, mask); switch (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) { default: BUG(); case CRYPTO_ALG_TYPE_CIPHER: len += crypto_cipher_ctxsize(alg); break; case CRYPTO_ALG_TYPE_COMPRESS: len += crypto_compress_ctxsize(alg); break; } return len; } void crypto_shoot_alg(struct crypto_alg *alg) { down_write(&crypto_alg_sem); alg->cra_flags |= CRYPTO_ALG_DYING; up_write(&crypto_alg_sem); } EXPORT_SYMBOL_GPL(crypto_shoot_alg); struct crypto_tfm *__crypto_alloc_tfmgfp(struct crypto_alg *alg, u32 type, u32 mask, gfp_t gfp) { struct crypto_tfm *tfm; unsigned int tfm_size; int err = -ENOMEM; tfm_size = sizeof(*tfm) + crypto_ctxsize(alg, type, mask); tfm = kzalloc(tfm_size, gfp); if (tfm == NULL) goto out_err; tfm->__crt_alg = alg; refcount_set(&tfm->refcnt, 1); if (!tfm->exit && alg->cra_init && (err = alg->cra_init(tfm))) goto cra_init_failed; goto out; cra_init_failed: crypto_exit_ops(tfm); if (err == -EAGAIN) crypto_shoot_alg(alg); kfree(tfm); out_err: tfm = ERR_PTR(err); out: return tfm; } EXPORT_SYMBOL_GPL(__crypto_alloc_tfmgfp); struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type, u32 mask) { return __crypto_alloc_tfmgfp(alg, type, mask, GFP_KERNEL); } EXPORT_SYMBOL_GPL(__crypto_alloc_tfm); /* * crypto_alloc_base - Locate algorithm and allocate transform * @alg_name: Name of algorithm * @type: Type of algorithm * @mask: Mask for type comparison * * This function should not be used by new algorithm types. * Please use crypto_alloc_tfm instead. * * crypto_alloc_base() will first attempt to locate an already loaded * algorithm. If that fails and the kernel supports dynamically loadable * modules, it will then attempt to load a module of the same name or * alias. If that fails it will send a query to any loaded crypto manager * to construct an algorithm on the fly. A refcount is grabbed on the * algorithm which is then associated with the new transform. * * The returned transform is of a non-determinate type. Most people * should use one of the more specific allocation functions such as * crypto_alloc_skcipher(). * * In case of error the return value is an error pointer. */ struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask) { struct crypto_tfm *tfm; int err; for (;;) { struct crypto_alg *alg; alg = crypto_alg_mod_lookup(alg_name, type, mask); if (IS_ERR(alg)) { err = PTR_ERR(alg); goto err; } tfm = __crypto_alloc_tfm(alg, type, mask); if (!IS_ERR(tfm)) return tfm; crypto_mod_put(alg); err = PTR_ERR(tfm); err: if (err != -EAGAIN) break; if (fatal_signal_pending(current)) { err = -EINTR; break; } } return ERR_PTR(err); } EXPORT_SYMBOL_GPL(crypto_alloc_base); static void *crypto_alloc_tfmmem(struct crypto_alg *alg, const struct crypto_type *frontend, int node, gfp_t gfp) { struct crypto_tfm *tfm; unsigned int tfmsize; unsigned int total; char *mem; tfmsize = frontend->tfmsize; total = tfmsize + sizeof(*tfm) + frontend->extsize(alg); mem = kzalloc_node(total, gfp, node); if (mem == NULL) return ERR_PTR(-ENOMEM); tfm = (struct crypto_tfm *)(mem + tfmsize); tfm->__crt_alg = alg; tfm->node = node; refcount_set(&tfm->refcnt, 1); return mem; } void *crypto_create_tfm_node(struct crypto_alg *alg, const struct crypto_type *frontend, int node) { struct crypto_tfm *tfm; char *mem; int err; mem = crypto_alloc_tfmmem(alg, frontend, node, GFP_KERNEL); if (IS_ERR(mem)) goto out; tfm = (struct crypto_tfm *)(mem + frontend->tfmsize); err = frontend->init_tfm(tfm); if (err) goto out_free_tfm; if (!tfm->exit && alg->cra_init && (err = alg->cra_init(tfm))) goto cra_init_failed; goto out; cra_init_failed: crypto_exit_ops(tfm); out_free_tfm: if (err == -EAGAIN) crypto_shoot_alg(alg); kfree(mem); mem = ERR_PTR(err); out: return mem; } EXPORT_SYMBOL_GPL(crypto_create_tfm_node); void *crypto_clone_tfm(const struct crypto_type *frontend, struct crypto_tfm *otfm) { struct crypto_alg *alg = otfm->__crt_alg; struct crypto_tfm *tfm; char *mem; mem = ERR_PTR(-ESTALE); if (unlikely(!crypto_mod_get(alg))) goto out; mem = crypto_alloc_tfmmem(alg, frontend, otfm->node, GFP_ATOMIC); if (IS_ERR(mem)) { crypto_mod_put(alg); goto out; } tfm = (struct crypto_tfm *)(mem + frontend->tfmsize); tfm->crt_flags = otfm->crt_flags; tfm->exit = otfm->exit; out: return mem; } EXPORT_SYMBOL_GPL(crypto_clone_tfm); struct crypto_alg *crypto_find_alg(const char *alg_name, const struct crypto_type *frontend, u32 type, u32 mask) { if (frontend) { type &= frontend->maskclear; mask &= frontend->maskclear; type |= frontend->type; mask |= frontend->maskset; } return crypto_alg_mod_lookup(alg_name, type, mask); } EXPORT_SYMBOL_GPL(crypto_find_alg); /* * crypto_alloc_tfm_node - Locate algorithm and allocate transform * @alg_name: Name of algorithm * @frontend: Frontend algorithm type * @type: Type of algorithm * @mask: Mask for type comparison * @node: NUMA node in which users desire to put requests, if node is * NUMA_NO_NODE, it means users have no special requirement. * * crypto_alloc_tfm() will first attempt to locate an already loaded * algorithm. If that fails and the kernel supports dynamically loadable * modules, it will then attempt to load a module of the same name or * alias. If that fails it will send a query to any loaded crypto manager * to construct an algorithm on the fly. A refcount is grabbed on the * algorithm which is then associated with the new transform. * * The returned transform is of a non-determinate type. Most people * should use one of the more specific allocation functions such as * crypto_alloc_skcipher(). * * In case of error the return value is an error pointer. */ void *crypto_alloc_tfm_node(const char *alg_name, const struct crypto_type *frontend, u32 type, u32 mask, int node) { void *tfm; int err; for (;;) { struct crypto_alg *alg; alg = crypto_find_alg(alg_name, frontend, type, mask); if (IS_ERR(alg)) { err = PTR_ERR(alg); goto err; } tfm = crypto_create_tfm_node(alg, frontend, node); if (!IS_ERR(tfm)) return tfm; crypto_mod_put(alg); err = PTR_ERR(tfm); err: if (err != -EAGAIN) break; if (fatal_signal_pending(current)) { err = -EINTR; break; } } return ERR_PTR(err); } EXPORT_SYMBOL_GPL(crypto_alloc_tfm_node); /* * crypto_destroy_tfm - Free crypto transform * @mem: Start of tfm slab * @tfm: Transform to free * * This function frees up the transform and any associated resources, * then drops the refcount on the associated algorithm. */ void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm) { struct crypto_alg *alg; if (IS_ERR_OR_NULL(mem)) return; if (!refcount_dec_and_test(&tfm->refcnt)) return; alg = tfm->__crt_alg; if (!tfm->exit && alg->cra_exit) alg->cra_exit(tfm); crypto_exit_ops(tfm); crypto_mod_put(alg); kfree_sensitive(mem); } EXPORT_SYMBOL_GPL(crypto_destroy_tfm); int crypto_has_alg(const char *name, u32 type, u32 mask) { int ret = 0; struct crypto_alg *alg = crypto_alg_mod_lookup(name, type, mask); if (!IS_ERR(alg)) { crypto_mod_put(alg); ret = 1; } return ret; } EXPORT_SYMBOL_GPL(crypto_has_alg); void crypto_req_done(void *data, int err) { struct crypto_wait *wait = data; if (err == -EINPROGRESS) return; wait->err = err; complete(&wait->completion); } EXPORT_SYMBOL_GPL(crypto_req_done); MODULE_DESCRIPTION("Cryptographic core API"); MODULE_LICENSE("GPL");
90 10 10 6 4 10 78 78 39 37 70 30 31 1 103 104 15 10 20 21 16 5 22 36 30 23 22 8 36 17 1 1 1 1 132 132 17 31 30 6 30 1 30 1 31 30 20 1 1 2 6 3 3 3 3 2 4 2 4 5 1 2 4 4 2 6 6 6 2 4 2 4 4 2 6 3 3 2 1 3 1 2 2 2 2 100 3 1 92 7 1 99 1 95 2 1 45 52 97 95 2 3 102 66 35 1 100 2 1 94 2 94 3 5 1 85 4 1 80 80 1 78 1 78 78 78 78 78 78 70 37 3 3 2 3 67 75 75 31 32 32 11 9 2 10 1 102 106 103 3 102 4 106 1 105 28 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 // SPDX-License-Identifier: GPL-2.0 /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * * * terminology * * cluster - allocation unit - 512,1K,2K,4K,...,2M * vcn - virtual cluster number - Offset inside the file in clusters. * vbo - virtual byte offset - Offset inside the file in bytes. * lcn - logical cluster number - 0 based cluster in clusters heap. * lbo - logical byte offset - Absolute position inside volume. * run - maps VCN to LCN - Stored in attributes in packed form. * attr - attribute segment - std/name/data etc records inside MFT. * mi - MFT inode - One MFT record(usually 1024 bytes or 4K), consists of attributes. * ni - NTFS inode - Extends linux inode. consists of one or more mft inodes. * index - unit inside directory - 2K, 4K, <=page size, does not depend on cluster size. * * WSL - Windows Subsystem for Linux * https://docs.microsoft.com/en-us/windows/wsl/file-permissions * It stores uid/gid/mode/dev in xattr * * ntfs allows up to 2^64 clusters per volume. * It means you should use 64 bits lcn to operate with ntfs. * Implementation of ntfs.sys uses only 32 bits lcn. * Default ntfs3 uses 32 bits lcn too. * ntfs3 built with CONFIG_NTFS3_64BIT_CLUSTER (ntfs3_64) uses 64 bits per lcn. * * * ntfs limits, cluster size is 4K (2^12) * ----------------------------------------------------------------------------- * | Volume size | Clusters | ntfs.sys | ntfs3 | ntfs3_64 | mkntfs | chkdsk | * ----------------------------------------------------------------------------- * | < 16T, 2^44 | < 2^32 | yes | yes | yes | yes | yes | * | > 16T, 2^44 | > 2^32 | no | no | yes | yes | yes | * ----------------------------------------------------------|------------------ * * To mount large volumes as ntfs one should use large cluster size (up to 2M) * The maximum volume size in this case is 2^32 * 2^21 = 2^53 = 8P * * ntfs limits, cluster size is 2M (2^21) * ----------------------------------------------------------------------------- * | < 8P, 2^53 | < 2^32 | yes | yes | yes | yes | yes | * | > 8P, 2^53 | > 2^32 | no | no | yes | yes | yes | * ----------------------------------------------------------|------------------ * */ #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/log2.h> #include <linux/minmax.h> #include <linux/module.h> #include <linux/nls.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/statfs.h> #include "debug.h" #include "ntfs.h" #include "ntfs_fs.h" #ifdef CONFIG_NTFS3_LZX_XPRESS #include "lib/lib.h" #endif #ifdef CONFIG_PRINTK /* * ntfs_printk - Trace warnings/notices/errors. * * Thanks Joe Perches <joe@perches.com> for implementation */ void ntfs_printk(const struct super_block *sb, const char *fmt, ...) { struct va_format vaf; va_list args; int level; struct ntfs_sb_info *sbi = sb->s_fs_info; /* Should we use different ratelimits for warnings/notices/errors? */ if (!___ratelimit(&sbi->msg_ratelimit, "ntfs3")) return; va_start(args, fmt); level = printk_get_level(fmt); vaf.fmt = printk_skip_level(fmt); vaf.va = &args; printk("%c%cntfs3: %s: %pV\n", KERN_SOH_ASCII, level, sb->s_id, &vaf); va_end(args); } static char s_name_buf[512]; static atomic_t s_name_buf_cnt = ATOMIC_INIT(1); // 1 means 'free s_name_buf'. /* * ntfs_inode_printk * * Print warnings/notices/errors about inode using name or inode number. */ void ntfs_inode_printk(struct inode *inode, const char *fmt, ...) { struct super_block *sb = inode->i_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; char *name; va_list args; struct va_format vaf; int level; if (!___ratelimit(&sbi->msg_ratelimit, "ntfs3")) return; /* Use static allocated buffer, if possible. */ name = atomic_dec_and_test(&s_name_buf_cnt) ? s_name_buf : kmalloc(sizeof(s_name_buf), GFP_NOFS); if (name) { struct dentry *de = d_find_alias(inode); const u32 name_len = ARRAY_SIZE(s_name_buf) - 1; if (de) { spin_lock(&de->d_lock); snprintf(name, name_len, " \"%s\"", de->d_name.name); spin_unlock(&de->d_lock); name[name_len] = 0; /* To be sure. */ } else { name[0] = 0; } dput(de); /* Cocci warns if placed in branch "if (de)" */ } va_start(args, fmt); level = printk_get_level(fmt); vaf.fmt = printk_skip_level(fmt); vaf.va = &args; printk("%c%cntfs3: %s: ino=%lx,%s %pV\n", KERN_SOH_ASCII, level, sb->s_id, inode->i_ino, name ? name : "", &vaf); va_end(args); atomic_inc(&s_name_buf_cnt); if (name != s_name_buf) kfree(name); } #endif /* * Shared memory struct. * * On-disk ntfs's upcase table is created by ntfs formatter. * 'upcase' table is 128K bytes of memory. * We should read it into memory when mounting. * Several ntfs volumes likely use the same 'upcase' table. * It is good idea to share in-memory 'upcase' table between different volumes. * Unfortunately winxp/vista/win7 use different upcase tables. */ static DEFINE_SPINLOCK(s_shared_lock); static struct { void *ptr; u32 len; int cnt; } s_shared[8]; /* * ntfs_set_shared * * Return: * * @ptr - If pointer was saved in shared memory. * * NULL - If pointer was not shared. */ void *ntfs_set_shared(void *ptr, u32 bytes) { void *ret = NULL; int i, j = -1; spin_lock(&s_shared_lock); for (i = 0; i < ARRAY_SIZE(s_shared); i++) { if (!s_shared[i].cnt) { j = i; } else if (bytes == s_shared[i].len && !memcmp(s_shared[i].ptr, ptr, bytes)) { s_shared[i].cnt += 1; ret = s_shared[i].ptr; break; } } if (!ret && j != -1) { s_shared[j].ptr = ptr; s_shared[j].len = bytes; s_shared[j].cnt = 1; ret = ptr; } spin_unlock(&s_shared_lock); return ret; } /* * ntfs_put_shared * * Return: * * @ptr - If pointer is not shared anymore. * * NULL - If pointer is still shared. */ void *ntfs_put_shared(void *ptr) { void *ret = ptr; int i; spin_lock(&s_shared_lock); for (i = 0; i < ARRAY_SIZE(s_shared); i++) { if (s_shared[i].cnt && s_shared[i].ptr == ptr) { if (--s_shared[i].cnt) ret = NULL; break; } } spin_unlock(&s_shared_lock); return ret; } static inline void put_mount_options(struct ntfs_mount_options *options) { kfree(options->nls_name); unload_nls(options->nls); kfree(options); } enum Opt { Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask, Opt_immutable, Opt_discard, Opt_force, Opt_sparse, Opt_nohidden, Opt_hide_dot_files, Opt_windows_names, Opt_showmeta, Opt_acl, Opt_iocharset, Opt_prealloc, Opt_nocase, Opt_err, }; // clang-format off static const struct fs_parameter_spec ntfs_fs_parameters[] = { fsparam_u32("uid", Opt_uid), fsparam_u32("gid", Opt_gid), fsparam_u32oct("umask", Opt_umask), fsparam_u32oct("dmask", Opt_dmask), fsparam_u32oct("fmask", Opt_fmask), fsparam_flag_no("sys_immutable", Opt_immutable), fsparam_flag_no("discard", Opt_discard), fsparam_flag_no("force", Opt_force), fsparam_flag_no("sparse", Opt_sparse), fsparam_flag_no("hidden", Opt_nohidden), fsparam_flag_no("hide_dot_files", Opt_hide_dot_files), fsparam_flag_no("windows_names", Opt_windows_names), fsparam_flag_no("showmeta", Opt_showmeta), fsparam_flag_no("acl", Opt_acl), fsparam_string("iocharset", Opt_iocharset), fsparam_flag_no("prealloc", Opt_prealloc), fsparam_flag_no("nocase", Opt_nocase), {} }; // clang-format on /* * Load nls table or if @nls is utf8 then return NULL. * * It is good idea to use here "const char *nls". * But load_nls accepts "char*". */ static struct nls_table *ntfs_load_nls(char *nls) { struct nls_table *ret; if (!nls) nls = CONFIG_NLS_DEFAULT; if (strcmp(nls, "utf8") == 0) return NULL; if (strcmp(nls, CONFIG_NLS_DEFAULT) == 0) return load_nls_default(); ret = load_nls(nls); if (ret) return ret; return ERR_PTR(-EINVAL); } static int ntfs_fs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct ntfs_mount_options *opts = fc->fs_private; struct fs_parse_result result; int opt; opt = fs_parse(fc, ntfs_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_uid: opts->fs_uid = make_kuid(current_user_ns(), result.uint_32); if (!uid_valid(opts->fs_uid)) return invalf(fc, "ntfs3: Invalid value for uid."); break; case Opt_gid: opts->fs_gid = make_kgid(current_user_ns(), result.uint_32); if (!gid_valid(opts->fs_gid)) return invalf(fc, "ntfs3: Invalid value for gid."); break; case Opt_umask: if (result.uint_32 & ~07777) return invalf(fc, "ntfs3: Invalid value for umask."); opts->fs_fmask_inv = ~result.uint_32; opts->fs_dmask_inv = ~result.uint_32; opts->fmask = 1; opts->dmask = 1; break; case Opt_dmask: if (result.uint_32 & ~07777) return invalf(fc, "ntfs3: Invalid value for dmask."); opts->fs_dmask_inv = ~result.uint_32; opts->dmask = 1; break; case Opt_fmask: if (result.uint_32 & ~07777) return invalf(fc, "ntfs3: Invalid value for fmask."); opts->fs_fmask_inv = ~result.uint_32; opts->fmask = 1; break; case Opt_immutable: opts->sys_immutable = result.negated ? 0 : 1; break; case Opt_discard: opts->discard = result.negated ? 0 : 1; break; case Opt_force: opts->force = result.negated ? 0 : 1; break; case Opt_sparse: opts->sparse = result.negated ? 0 : 1; break; case Opt_nohidden: opts->nohidden = result.negated ? 1 : 0; break; case Opt_hide_dot_files: opts->hide_dot_files = result.negated ? 0 : 1; break; case Opt_windows_names: opts->windows_names = result.negated ? 0 : 1; break; case Opt_showmeta: opts->showmeta = result.negated ? 0 : 1; break; case Opt_acl: if (!result.negated) #ifdef CONFIG_NTFS3_FS_POSIX_ACL fc->sb_flags |= SB_POSIXACL; #else return invalf( fc, "ntfs3: Support for ACL not compiled in!"); #endif else fc->sb_flags &= ~SB_POSIXACL; break; case Opt_iocharset: kfree(opts->nls_name); opts->nls_name = param->string; param->string = NULL; break; case Opt_prealloc: opts->prealloc = result.negated ? 0 : 1; break; case Opt_nocase: opts->nocase = result.negated ? 1 : 0; break; default: /* Should not be here unless we forget add case. */ return -EINVAL; } return 0; } static int ntfs_fs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; struct ntfs_mount_options *new_opts = fc->fs_private; int ro_rw; ro_rw = sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY); if (ro_rw && (sbi->flags & NTFS_FLAGS_NEED_REPLAY)) { errorf(fc, "ntfs3: Couldn't remount rw because journal is not replayed. Please umount/remount instead\n"); return -EINVAL; } new_opts->nls = ntfs_load_nls(new_opts->nls_name); if (IS_ERR(new_opts->nls)) { new_opts->nls = NULL; errorf(fc, "ntfs3: Cannot load iocharset %s", new_opts->nls_name); return -EINVAL; } if (new_opts->nls != sbi->options->nls) return invalf( fc, "ntfs3: Cannot use different iocharset when remounting!"); sync_filesystem(sb); if (ro_rw && (sbi->volume.flags & VOLUME_FLAG_DIRTY) && !new_opts->force) { errorf(fc, "ntfs3: Volume is dirty and \"force\" flag is not set!"); return -EINVAL; } swap(sbi->options, fc->fs_private); return 0; } #ifdef CONFIG_PROC_FS static struct proc_dir_entry *proc_info_root; /* * ntfs3_volinfo: * * The content of /proc/fs/ntfs3/<dev>/volinfo * * ntfs3.1 * cluster size * number of clusters * total number of mft records * number of used mft records ~= number of files + folders * real state of ntfs "dirty"/"clean" * current state of ntfs "dirty"/"clean" */ static int ntfs3_volinfo(struct seq_file *m, void *o) { struct super_block *sb = m->private; struct ntfs_sb_info *sbi = sb->s_fs_info; seq_printf(m, "ntfs%d.%d\n%u\n%zu\n\%zu\n%zu\n%s\n%s\n", sbi->volume.major_ver, sbi->volume.minor_ver, sbi->cluster_size, sbi->used.bitmap.nbits, sbi->mft.bitmap.nbits, sbi->mft.bitmap.nbits - wnd_zeroes(&sbi->mft.bitmap), sbi->volume.real_dirty ? "dirty" : "clean", (sbi->volume.flags & VOLUME_FLAG_DIRTY) ? "dirty" : "clean"); return 0; } static int ntfs3_volinfo_open(struct inode *inode, struct file *file) { return single_open(file, ntfs3_volinfo, pde_data(inode)); } /* read /proc/fs/ntfs3/<dev>/label */ static int ntfs3_label_show(struct seq_file *m, void *o) { struct super_block *sb = m->private; struct ntfs_sb_info *sbi = sb->s_fs_info; seq_printf(m, "%s\n", sbi->volume.label); return 0; } /* write /proc/fs/ntfs3/<dev>/label */ static ssize_t ntfs3_label_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { int err; struct super_block *sb = pde_data(file_inode(file)); ssize_t ret = count; u8 *label; if (sb_rdonly(sb)) return -EROFS; label = kmalloc(count, GFP_NOFS); if (!label) return -ENOMEM; if (copy_from_user(label, buffer, ret)) { ret = -EFAULT; goto out; } while (ret > 0 && label[ret - 1] == '\n') ret -= 1; err = ntfs_set_label(sb->s_fs_info, label, ret); if (err < 0) { ntfs_err(sb, "failed (%d) to write label", err); ret = err; goto out; } *ppos += count; ret = count; out: kfree(label); return ret; } static int ntfs3_label_open(struct inode *inode, struct file *file) { return single_open(file, ntfs3_label_show, pde_data(inode)); } static const struct proc_ops ntfs3_volinfo_fops = { .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, .proc_open = ntfs3_volinfo_open, }; static const struct proc_ops ntfs3_label_fops = { .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, .proc_open = ntfs3_label_open, .proc_write = ntfs3_label_write, }; #endif static struct kmem_cache *ntfs_inode_cachep; static struct inode *ntfs_alloc_inode(struct super_block *sb) { struct ntfs_inode *ni = alloc_inode_sb(sb, ntfs_inode_cachep, GFP_NOFS); if (!ni) return NULL; memset(ni, 0, offsetof(struct ntfs_inode, vfs_inode)); mutex_init(&ni->ni_lock); return &ni->vfs_inode; } static void ntfs_free_inode(struct inode *inode) { struct ntfs_inode *ni = ntfs_i(inode); mutex_destroy(&ni->ni_lock); kmem_cache_free(ntfs_inode_cachep, ni); } static void init_once(void *foo) { struct ntfs_inode *ni = foo; inode_init_once(&ni->vfs_inode); } /* * Noinline to reduce binary size. */ static noinline void ntfs3_put_sbi(struct ntfs_sb_info *sbi) { wnd_close(&sbi->mft.bitmap); wnd_close(&sbi->used.bitmap); if (sbi->mft.ni) { iput(&sbi->mft.ni->vfs_inode); sbi->mft.ni = NULL; } if (sbi->security.ni) { iput(&sbi->security.ni->vfs_inode); sbi->security.ni = NULL; } if (sbi->reparse.ni) { iput(&sbi->reparse.ni->vfs_inode); sbi->reparse.ni = NULL; } if (sbi->objid.ni) { iput(&sbi->objid.ni->vfs_inode); sbi->objid.ni = NULL; } if (sbi->volume.ni) { iput(&sbi->volume.ni->vfs_inode); sbi->volume.ni = NULL; } ntfs_update_mftmirr(sbi, 0); indx_clear(&sbi->security.index_sii); indx_clear(&sbi->security.index_sdh); indx_clear(&sbi->reparse.index_r); indx_clear(&sbi->objid.index_o); } static void ntfs3_free_sbi(struct ntfs_sb_info *sbi) { kfree(sbi->new_rec); kvfree(ntfs_put_shared(sbi->upcase)); kfree(sbi->def_table); kfree(sbi->compress.lznt); #ifdef CONFIG_NTFS3_LZX_XPRESS xpress_free_decompressor(sbi->compress.xpress); lzx_free_decompressor(sbi->compress.lzx); #endif kfree(sbi); } static void ntfs_put_super(struct super_block *sb) { struct ntfs_sb_info *sbi = sb->s_fs_info; #ifdef CONFIG_PROC_FS // Remove /proc/fs/ntfs3/.. if (sbi->procdir) { remove_proc_entry("label", sbi->procdir); remove_proc_entry("volinfo", sbi->procdir); remove_proc_entry(sb->s_id, proc_info_root); sbi->procdir = NULL; } #endif /* Mark rw ntfs as clear, if possible. */ ntfs_set_state(sbi, NTFS_DIRTY_CLEAR); ntfs3_put_sbi(sbi); } static int ntfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; struct wnd_bitmap *wnd = &sbi->used.bitmap; buf->f_type = sb->s_magic; buf->f_bsize = sbi->cluster_size; buf->f_blocks = wnd->nbits; buf->f_bfree = buf->f_bavail = wnd_zeroes(wnd); buf->f_fsid.val[0] = sbi->volume.ser_num; buf->f_fsid.val[1] = (sbi->volume.ser_num >> 32); buf->f_namelen = NTFS_NAME_LEN; return 0; } static int ntfs_show_options(struct seq_file *m, struct dentry *root) { struct super_block *sb = root->d_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; struct ntfs_mount_options *opts = sbi->options; struct user_namespace *user_ns = seq_user_ns(m); seq_printf(m, ",uid=%u", from_kuid_munged(user_ns, opts->fs_uid)); seq_printf(m, ",gid=%u", from_kgid_munged(user_ns, opts->fs_gid)); if (opts->dmask) seq_printf(m, ",dmask=%04o", opts->fs_dmask_inv ^ 0xffff); if (opts->fmask) seq_printf(m, ",fmask=%04o", opts->fs_fmask_inv ^ 0xffff); if (opts->sys_immutable) seq_puts(m, ",sys_immutable"); if (opts->discard) seq_puts(m, ",discard"); if (opts->force) seq_puts(m, ",force"); if (opts->sparse) seq_puts(m, ",sparse"); if (opts->nohidden) seq_puts(m, ",nohidden"); if (opts->hide_dot_files) seq_puts(m, ",hide_dot_files"); if (opts->windows_names) seq_puts(m, ",windows_names"); if (opts->showmeta) seq_puts(m, ",showmeta"); if (sb->s_flags & SB_POSIXACL) seq_puts(m, ",acl"); if (opts->nls) seq_printf(m, ",iocharset=%s", opts->nls->charset); else seq_puts(m, ",iocharset=utf8"); if (opts->prealloc) seq_puts(m, ",prealloc"); if (opts->nocase) seq_puts(m, ",nocase"); return 0; } /* * ntfs_sync_fs - super_operations::sync_fs */ static int ntfs_sync_fs(struct super_block *sb, int wait) { int err = 0, err2; struct ntfs_sb_info *sbi = sb->s_fs_info; struct ntfs_inode *ni; struct inode *inode; ni = sbi->security.ni; if (ni) { inode = &ni->vfs_inode; err2 = _ni_write_inode(inode, wait); if (err2 && !err) err = err2; } ni = sbi->objid.ni; if (ni) { inode = &ni->vfs_inode; err2 = _ni_write_inode(inode, wait); if (err2 && !err) err = err2; } ni = sbi->reparse.ni; if (ni) { inode = &ni->vfs_inode; err2 = _ni_write_inode(inode, wait); if (err2 && !err) err = err2; } if (!err) ntfs_set_state(sbi, NTFS_DIRTY_CLEAR); ntfs_update_mftmirr(sbi, wait); return err; } static const struct super_operations ntfs_sops = { .alloc_inode = ntfs_alloc_inode, .free_inode = ntfs_free_inode, .evict_inode = ntfs_evict_inode, .put_super = ntfs_put_super, .statfs = ntfs_statfs, .show_options = ntfs_show_options, .sync_fs = ntfs_sync_fs, .write_inode = ntfs3_write_inode, }; static struct inode *ntfs_export_get_inode(struct super_block *sb, u64 ino, u32 generation) { struct MFT_REF ref; struct inode *inode; ref.low = cpu_to_le32(ino); #ifdef CONFIG_NTFS3_64BIT_CLUSTER ref.high = cpu_to_le16(ino >> 32); #else ref.high = 0; #endif ref.seq = cpu_to_le16(generation); inode = ntfs_iget5(sb, &ref, NULL); if (!IS_ERR(inode) && is_bad_inode(inode)) { iput(inode); inode = ERR_PTR(-ESTALE); } return inode; } static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ntfs_export_get_inode); } static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, ntfs_export_get_inode); } /* TODO: == ntfs_sync_inode */ static int ntfs_nfs_commit_metadata(struct inode *inode) { return _ni_write_inode(inode, 1); } static const struct export_operations ntfs_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = ntfs_fh_to_dentry, .fh_to_parent = ntfs_fh_to_parent, .get_parent = ntfs3_get_parent, .commit_metadata = ntfs_nfs_commit_metadata, }; /* * format_size_gb - Return Gb,Mb to print with "%u.%02u Gb". */ static u32 format_size_gb(const u64 bytes, u32 *mb) { /* Do simple right 30 bit shift of 64 bit value. */ u64 kbytes = bytes >> 10; u32 kbytes32 = kbytes; *mb = (100 * (kbytes32 & 0xfffff) + 0x7ffff) >> 20; if (*mb >= 100) *mb = 99; return (kbytes32 >> 20) | (((u32)(kbytes >> 32)) << 12); } static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot) { if (boot->sectors_per_clusters <= 0x80) return boot->sectors_per_clusters; if (boot->sectors_per_clusters >= 0xf4) /* limit shift to 2MB max */ return 1U << (-(s8)boot->sectors_per_clusters); return -EINVAL; } /* * ntfs_init_from_boot - Init internal info from on-disk boot sector. * * NTFS mount begins from boot - special formatted 512 bytes. * There are two boots: the first and the last 512 bytes of volume. * The content of boot is not changed during ntfs life. * * NOTE: ntfs.sys checks only first (primary) boot. * chkdsk checks both boots. */ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, u64 dev_size, struct NTFS_BOOT **boot2) { struct ntfs_sb_info *sbi = sb->s_fs_info; int err; u32 mb, gb, boot_sector_size, sct_per_clst, record_size; u64 sectors, clusters, mlcn, mlcn2, dev_size0; struct NTFS_BOOT *boot; struct buffer_head *bh; struct MFT_REC *rec; u16 fn, ao; u8 cluster_bits; u32 boot_off = 0; const char *hint = "Primary boot"; /* Save original dev_size. Used with alternative boot. */ dev_size0 = dev_size; sbi->volume.blocks = dev_size >> PAGE_SHIFT; bh = ntfs_bread(sb, 0); if (!bh) return -EIO; check_boot: err = -EINVAL; /* Corrupted image; do not read OOB */ if (bh->b_size - sizeof(*boot) < boot_off) goto out; boot = (struct NTFS_BOOT *)Add2Ptr(bh->b_data, boot_off); if (memcmp(boot->system_id, "NTFS ", sizeof("NTFS ") - 1)) { ntfs_err(sb, "%s signature is not NTFS.", hint); goto out; } /* 0x55AA is not mandaroty. Thanks Maxim Suhanov*/ /*if (0x55 != boot->boot_magic[0] || 0xAA != boot->boot_magic[1]) * goto out; */ boot_sector_size = ((u32)boot->bytes_per_sector[1] << 8) | boot->bytes_per_sector[0]; if (boot_sector_size < SECTOR_SIZE || !is_power_of_2(boot_sector_size)) { ntfs_err(sb, "%s: invalid bytes per sector %u.", hint, boot_sector_size); goto out; } /* cluster size: 512, 1K, 2K, 4K, ... 2M */ sct_per_clst = true_sectors_per_clst(boot); if ((int)sct_per_clst < 0 || !is_power_of_2(sct_per_clst)) { ntfs_err(sb, "%s: invalid sectors per cluster %u.", hint, sct_per_clst); goto out; } sbi->cluster_size = boot_sector_size * sct_per_clst; sbi->cluster_bits = cluster_bits = blksize_bits(sbi->cluster_size); sbi->cluster_mask = sbi->cluster_size - 1; sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask; mlcn = le64_to_cpu(boot->mft_clst); mlcn2 = le64_to_cpu(boot->mft2_clst); sectors = le64_to_cpu(boot->sectors_per_volume); if (mlcn * sct_per_clst >= sectors || mlcn2 * sct_per_clst >= sectors) { ntfs_err( sb, "%s: start of MFT 0x%llx (0x%llx) is out of volume 0x%llx.", hint, mlcn, mlcn2, sectors); goto out; } if (boot->record_size >= 0) { record_size = (u32)boot->record_size << cluster_bits; } else if (-boot->record_size <= MAXIMUM_SHIFT_BYTES_PER_MFT) { record_size = 1u << (-boot->record_size); } else { ntfs_err(sb, "%s: invalid record size %d.", hint, boot->record_size); goto out; } sbi->record_size = record_size; sbi->record_bits = blksize_bits(record_size); sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes /* Check MFT record size. */ if (record_size < SECTOR_SIZE || !is_power_of_2(record_size)) { ntfs_err(sb, "%s: invalid bytes per MFT record %u (%d).", hint, record_size, boot->record_size); goto out; } if (record_size > MAXIMUM_BYTES_PER_MFT) { ntfs_err(sb, "Unsupported bytes per MFT record %u.", record_size); goto out; } if (boot->index_size >= 0) { sbi->index_size = (u32)boot->index_size << cluster_bits; } else if (-boot->index_size <= MAXIMUM_SHIFT_BYTES_PER_INDEX) { sbi->index_size = 1u << (-boot->index_size); } else { ntfs_err(sb, "%s: invalid index size %d.", hint, boot->index_size); goto out; } /* Check index record size. */ if (sbi->index_size < SECTOR_SIZE || !is_power_of_2(sbi->index_size)) { ntfs_err(sb, "%s: invalid bytes per index %u(%d).", hint, sbi->index_size, boot->index_size); goto out; } if (sbi->index_size > MAXIMUM_BYTES_PER_INDEX) { ntfs_err(sb, "%s: unsupported bytes per index %u.", hint, sbi->index_size); goto out; } sbi->volume.size = sectors * boot_sector_size; gb = format_size_gb(sbi->volume.size + boot_sector_size, &mb); /* * - Volume formatted and mounted with the same sector size. * - Volume formatted 4K and mounted as 512. * - Volume formatted 512 and mounted as 4K. */ if (boot_sector_size != sector_size) { ntfs_warn( sb, "Different NTFS sector size (%u) and media sector size (%u).", boot_sector_size, sector_size); dev_size += sector_size - 1; } sbi->mft.lbo = mlcn << cluster_bits; sbi->mft.lbo2 = mlcn2 << cluster_bits; /* Compare boot's cluster and sector. */ if (sbi->cluster_size < boot_sector_size) { ntfs_err(sb, "%s: invalid bytes per cluster (%u).", hint, sbi->cluster_size); goto out; } /* Compare boot's cluster and media sector. */ if (sbi->cluster_size < sector_size) { /* No way to use ntfs_get_block in this case. */ ntfs_err( sb, "Failed to mount 'cause NTFS's cluster size (%u) is less than media sector size (%u).", sbi->cluster_size, sector_size); goto out; } sbi->max_bytes_per_attr = record_size - ALIGN(MFTRECORD_FIXUP_OFFSET, 8) - ALIGN(((record_size >> SECTOR_SHIFT) * sizeof(short)), 8) - ALIGN(sizeof(enum ATTR_TYPE), 8); sbi->volume.ser_num = le64_to_cpu(boot->serial_num); /* Warning if RAW volume. */ if (dev_size < sbi->volume.size + boot_sector_size) { u32 mb0, gb0; gb0 = format_size_gb(dev_size, &mb0); ntfs_warn( sb, "RAW NTFS volume: Filesystem size %u.%02u Gb > volume size %u.%02u Gb. Mount in read-only.", gb, mb, gb0, mb0); sb->s_flags |= SB_RDONLY; } clusters = sbi->volume.size >> cluster_bits; #ifndef CONFIG_NTFS3_64BIT_CLUSTER /* 32 bits per cluster. */ if (clusters >> 32) { ntfs_notice( sb, "NTFS %u.%02u Gb is too big to use 32 bits per cluster.", gb, mb); goto out; } #elif BITS_PER_LONG < 64 #error "CONFIG_NTFS3_64BIT_CLUSTER incompatible in 32 bit OS" #endif sbi->used.bitmap.nbits = clusters; rec = kzalloc(record_size, GFP_NOFS); if (!rec) { err = -ENOMEM; goto out; } sbi->new_rec = rec; rec->rhdr.sign = NTFS_FILE_SIGNATURE; rec->rhdr.fix_off = cpu_to_le16(MFTRECORD_FIXUP_OFFSET); fn = (sbi->record_size >> SECTOR_SHIFT) + 1; rec->rhdr.fix_num = cpu_to_le16(fn); ao = ALIGN(MFTRECORD_FIXUP_OFFSET + sizeof(short) * fn, 8); rec->attr_off = cpu_to_le16(ao); rec->used = cpu_to_le32(ao + ALIGN(sizeof(enum ATTR_TYPE), 8)); rec->total = cpu_to_le32(sbi->record_size); ((struct ATTRIB *)Add2Ptr(rec, ao))->type = ATTR_END; sb_set_blocksize(sb, min_t(u32, sbi->cluster_size, PAGE_SIZE)); sbi->block_mask = sb->s_blocksize - 1; sbi->blocks_per_cluster = sbi->cluster_size >> sb->s_blocksize_bits; sbi->volume.blocks = sbi->volume.size >> sb->s_blocksize_bits; /* Maximum size for normal files. */ sbi->maxbytes = (clusters << cluster_bits) - 1; #ifdef CONFIG_NTFS3_64BIT_CLUSTER if (clusters >= (1ull << (64 - cluster_bits))) sbi->maxbytes = -1; sbi->maxbytes_sparse = -1; sb->s_maxbytes = MAX_LFS_FILESIZE; #else /* Maximum size for sparse file. */ sbi->maxbytes_sparse = (1ull << (cluster_bits + 32)) - 1; sb->s_maxbytes = 0xFFFFFFFFull << cluster_bits; #endif /* * Compute the MFT zone at two steps. * It would be nice if we are able to allocate 1/8 of * total clusters for MFT but not more then 512 MB. */ sbi->zone_max = min_t(CLST, 0x20000000 >> cluster_bits, clusters >> 3); err = 0; if (bh->b_blocknr && !sb_rdonly(sb)) { /* * Alternative boot is ok but primary is not ok. * Do not update primary boot here 'cause it may be faked boot. * Let ntfs to be mounted and update boot later. */ *boot2 = kmemdup(boot, sizeof(*boot), GFP_NOFS | __GFP_NOWARN); } out: if (err == -EINVAL && !bh->b_blocknr && dev_size0 > PAGE_SHIFT) { u32 block_size = min_t(u32, sector_size, PAGE_SIZE); u64 lbo = dev_size0 - sizeof(*boot); /* * Try alternative boot (last sector) */ brelse(bh); sb_set_blocksize(sb, block_size); bh = ntfs_bread(sb, lbo >> blksize_bits(block_size)); if (!bh) return -EINVAL; boot_off = lbo & (block_size - 1); hint = "Alternative boot"; dev_size = dev_size0; /* restore original size. */ goto check_boot; } brelse(bh); return err; } /* * ntfs_fill_super - Try to mount. */ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) { int err; struct ntfs_sb_info *sbi = sb->s_fs_info; struct block_device *bdev = sb->s_bdev; struct ntfs_mount_options *options; struct inode *inode; struct ntfs_inode *ni; size_t i, tt, bad_len, bad_frags; CLST vcn, lcn, len; struct ATTRIB *attr; const struct VOLUME_INFO *info; u32 idx, done, bytes; struct ATTR_DEF_ENTRY *t; u16 *shared; struct MFT_REF ref; bool ro = sb_rdonly(sb); struct NTFS_BOOT *boot2 = NULL; ref.high = 0; sbi->sb = sb; sbi->options = options = fc->fs_private; fc->fs_private = NULL; sb->s_flags |= SB_NODIRATIME; sb->s_magic = 0x7366746e; // "ntfs" sb->s_op = &ntfs_sops; sb->s_export_op = &ntfs_export_ops; sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec sb->s_xattr = ntfs_xattr_handlers; sb->s_d_op = options->nocase ? &ntfs_dentry_ops : NULL; options->nls = ntfs_load_nls(options->nls_name); if (IS_ERR(options->nls)) { options->nls = NULL; errorf(fc, "Cannot load nls %s", options->nls_name); err = -EINVAL; goto out; } if (bdev_max_discard_sectors(bdev) && bdev_discard_granularity(bdev)) { sbi->discard_granularity = bdev_discard_granularity(bdev); sbi->discard_granularity_mask_inv = ~(u64)(sbi->discard_granularity - 1); } /* Parse boot. */ err = ntfs_init_from_boot(sb, bdev_logical_block_size(bdev), bdev_nr_bytes(bdev), &boot2); if (err) goto out; /* * Load $Volume. This should be done before $LogFile * 'cause 'sbi->volume.ni' is used 'ntfs_set_state'. */ ref.low = cpu_to_le32(MFT_REC_VOL); ref.seq = cpu_to_le16(MFT_REC_VOL); inode = ntfs_iget5(sb, &ref, &NAME_VOLUME); if (IS_ERR(inode)) { err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $Volume (%d).", err); goto out; } ni = ntfs_i(inode); /* Load and save label (not necessary). */ attr = ni_find_attr(ni, NULL, NULL, ATTR_LABEL, NULL, 0, NULL, NULL); if (!attr) { /* It is ok if no ATTR_LABEL */ } else if (!attr->non_res && !is_attr_ext(attr)) { /* $AttrDef allows labels to be up to 128 symbols. */ err = utf16s_to_utf8s(resident_data(attr), le32_to_cpu(attr->res.data_size) >> 1, UTF16_LITTLE_ENDIAN, sbi->volume.label, sizeof(sbi->volume.label)); if (err < 0) sbi->volume.label[0] = 0; } else { /* Should we break mounting here? */ //err = -EINVAL; //goto put_inode_out; } attr = ni_find_attr(ni, attr, NULL, ATTR_VOL_INFO, NULL, 0, NULL, NULL); if (!attr || is_attr_ext(attr) || !(info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO))) { ntfs_err(sb, "$Volume is corrupted."); err = -EINVAL; goto put_inode_out; } sbi->volume.major_ver = info->major_ver; sbi->volume.minor_ver = info->minor_ver; sbi->volume.flags = info->flags; sbi->volume.ni = ni; if (info->flags & VOLUME_FLAG_DIRTY) { sbi->volume.real_dirty = true; ntfs_info(sb, "It is recommened to use chkdsk."); } /* Load $MFTMirr to estimate recs_mirr. */ ref.low = cpu_to_le32(MFT_REC_MIRR); ref.seq = cpu_to_le16(MFT_REC_MIRR); inode = ntfs_iget5(sb, &ref, &NAME_MIRROR); if (IS_ERR(inode)) { err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $MFTMirr (%d).", err); goto out; } sbi->mft.recs_mirr = ntfs_up_cluster(sbi, inode->i_size) >> sbi->record_bits; iput(inode); /* Load LogFile to replay. */ ref.low = cpu_to_le32(MFT_REC_LOG); ref.seq = cpu_to_le16(MFT_REC_LOG); inode = ntfs_iget5(sb, &ref, &NAME_LOGFILE); if (IS_ERR(inode)) { err = PTR_ERR(inode); ntfs_err(sb, "Failed to load \x24LogFile (%d).", err); goto out; } ni = ntfs_i(inode); err = ntfs_loadlog_and_replay(ni, sbi); if (err) goto put_inode_out; iput(inode); if ((sbi->flags & NTFS_FLAGS_NEED_REPLAY) && !ro) { ntfs_warn(sb, "failed to replay log file. Can't mount rw!"); err = -EINVAL; goto out; } if ((sbi->volume.flags & VOLUME_FLAG_DIRTY) && !ro && !options->force) { ntfs_warn(sb, "volume is dirty and \"force\" flag is not set!"); err = -EINVAL; goto out; } /* Load $MFT. */ ref.low = cpu_to_le32(MFT_REC_MFT); ref.seq = cpu_to_le16(1); inode = ntfs_iget5(sb, &ref, &NAME_MFT); if (IS_ERR(inode)) { err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $MFT (%d).", err); goto out; } ni = ntfs_i(inode); sbi->mft.used = ni->i_valid >> sbi->record_bits; tt = inode->i_size >> sbi->record_bits; sbi->mft.next_free = MFT_REC_USER; err = wnd_init(&sbi->mft.bitmap, sb, tt); if (err) goto put_inode_out; err = ni_load_all_mi(ni); if (err) { ntfs_err(sb, "Failed to load $MFT's subrecords (%d).", err); goto put_inode_out; } sbi->mft.ni = ni; /* Load $Bitmap. */ ref.low = cpu_to_le32(MFT_REC_BITMAP); ref.seq = cpu_to_le16(MFT_REC_BITMAP); inode = ntfs_iget5(sb, &ref, &NAME_BITMAP); if (IS_ERR(inode)) { err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $Bitmap (%d).", err); goto out; } #ifndef CONFIG_NTFS3_64BIT_CLUSTER if (inode->i_size >> 32) { err = -EINVAL; goto put_inode_out; } #endif /* Check bitmap boundary. */ tt = sbi->used.bitmap.nbits; if (inode->i_size < bitmap_size(tt)) { ntfs_err(sb, "$Bitmap is corrupted."); err = -EINVAL; goto put_inode_out; } err = wnd_init(&sbi->used.bitmap, sb, tt); if (err) { ntfs_err(sb, "Failed to initialize $Bitmap (%d).", err); goto put_inode_out; } iput(inode); /* Compute the MFT zone. */ err = ntfs_refresh_zone(sbi); if (err) { ntfs_err(sb, "Failed to initialize MFT zone (%d).", err); goto out; } /* Load $BadClus. */ ref.low = cpu_to_le32(MFT_REC_BADCLUST); ref.seq = cpu_to_le16(MFT_REC_BADCLUST); inode = ntfs_iget5(sb, &ref, &NAME_BADCLUS); if (IS_ERR(inode)) { err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $BadClus (%d).", err); goto out; } ni = ntfs_i(inode); bad_len = bad_frags = 0; for (i = 0; run_get_entry(&ni->file.run, i, &vcn, &lcn, &len); i++) { if (lcn == SPARSE_LCN) continue; bad_len += len; bad_frags += 1; if (ro) continue; if (wnd_set_used_safe(&sbi->used.bitmap, lcn, len, &tt) || tt) { /* Bad blocks marked as free in bitmap. */ ntfs_set_state(sbi, NTFS_DIRTY_ERROR); } } if (bad_len) { /* * Notice about bad blocks. * In normal cases these blocks are marked as used in bitmap. * And we never allocate space in it. */ ntfs_notice(sb, "Volume contains %zu bad blocks in %zu fragments.", bad_len, bad_frags); } iput(inode); /* Load $AttrDef. */ ref.low = cpu_to_le32(MFT_REC_ATTR); ref.seq = cpu_to_le16(MFT_REC_ATTR); inode = ntfs_iget5(sb, &ref, &NAME_ATTRDEF); if (IS_ERR(inode)) { err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $AttrDef (%d)", err); goto out; } /* * Typical $AttrDef contains up to 20 entries. * Check for extremely large/small size. */ if (inode->i_size < sizeof(struct ATTR_DEF_ENTRY) || inode->i_size > 100 * sizeof(struct ATTR_DEF_ENTRY)) { ntfs_err(sb, "Looks like $AttrDef is corrupted (size=%llu).", inode->i_size); err = -EINVAL; goto put_inode_out; } bytes = inode->i_size; sbi->def_table = t = kvmalloc(bytes, GFP_KERNEL); if (!t) { err = -ENOMEM; goto put_inode_out; } for (done = idx = 0; done < bytes; done += PAGE_SIZE, idx++) { unsigned long tail = bytes - done; struct page *page = ntfs_map_page(inode->i_mapping, idx); if (IS_ERR(page)) { err = PTR_ERR(page); ntfs_err(sb, "Failed to read $AttrDef (%d).", err); goto put_inode_out; } memcpy(Add2Ptr(t, done), page_address(page), min(PAGE_SIZE, tail)); ntfs_unmap_page(page); if (!idx && ATTR_STD != t->type) { ntfs_err(sb, "$AttrDef is corrupted."); err = -EINVAL; goto put_inode_out; } } t += 1; sbi->def_entries = 1; done = sizeof(struct ATTR_DEF_ENTRY); sbi->reparse.max_size = MAXIMUM_REPARSE_DATA_BUFFER_SIZE; sbi->ea_max_size = 0x10000; /* default formatter value */ while (done + sizeof(struct ATTR_DEF_ENTRY) <= bytes) { u32 t32 = le32_to_cpu(t->type); u64 sz = le64_to_cpu(t->max_sz); if ((t32 & 0xF) || le32_to_cpu(t[-1].type) >= t32) break; if (t->type == ATTR_REPARSE) sbi->reparse.max_size = sz; else if (t->type == ATTR_EA) sbi->ea_max_size = sz; done += sizeof(struct ATTR_DEF_ENTRY); t += 1; sbi->def_entries += 1; } iput(inode); /* Load $UpCase. */ ref.low = cpu_to_le32(MFT_REC_UPCASE); ref.seq = cpu_to_le16(MFT_REC_UPCASE); inode = ntfs_iget5(sb, &ref, &NAME_UPCASE); if (IS_ERR(inode)) { err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $UpCase (%d).", err); goto out; } if (inode->i_size != 0x10000 * sizeof(short)) { err = -EINVAL; ntfs_err(sb, "$UpCase is corrupted."); goto put_inode_out; } for (idx = 0; idx < (0x10000 * sizeof(short) >> PAGE_SHIFT); idx++) { const __le16 *src; u16 *dst = Add2Ptr(sbi->upcase, idx << PAGE_SHIFT); struct page *page = ntfs_map_page(inode->i_mapping, idx); if (IS_ERR(page)) { err = PTR_ERR(page); ntfs_err(sb, "Failed to read $UpCase (%d).", err); goto put_inode_out; } src = page_address(page); #ifdef __BIG_ENDIAN for (i = 0; i < PAGE_SIZE / sizeof(u16); i++) *dst++ = le16_to_cpu(*src++); #else memcpy(dst, src, PAGE_SIZE); #endif ntfs_unmap_page(page); } shared = ntfs_set_shared(sbi->upcase, 0x10000 * sizeof(short)); if (shared && sbi->upcase != shared) { kvfree(sbi->upcase); sbi->upcase = shared; } iput(inode); if (is_ntfs3(sbi)) { /* Load $Secure. */ err = ntfs_security_init(sbi); if (err) { ntfs_err(sb, "Failed to initialize $Secure (%d).", err); goto out; } /* Load $Extend. */ err = ntfs_extend_init(sbi); if (err) { ntfs_warn(sb, "Failed to initialize $Extend."); goto load_root; } /* Load $Extend/$Reparse. */ err = ntfs_reparse_init(sbi); if (err) { ntfs_warn(sb, "Failed to initialize $Extend/$Reparse."); goto load_root; } /* Load $Extend/$ObjId. */ err = ntfs_objid_init(sbi); if (err) { ntfs_warn(sb, "Failed to initialize $Extend/$ObjId."); goto load_root; } } load_root: /* Load root. */ ref.low = cpu_to_le32(MFT_REC_ROOT); ref.seq = cpu_to_le16(MFT_REC_ROOT); inode = ntfs_iget5(sb, &ref, &NAME_ROOT); if (IS_ERR(inode)) { err = PTR_ERR(inode); ntfs_err(sb, "Failed to load root (%d).", err); goto out; } /* * Final check. Looks like this case should never occurs. */ if (!inode->i_op) { err = -EINVAL; ntfs_err(sb, "Failed to load root (%d).", err); goto put_inode_out; } sb->s_root = d_make_root(inode); if (!sb->s_root) { err = -ENOMEM; goto put_inode_out; } if (boot2) { /* * Alternative boot is ok but primary is not ok. * Volume is recognized as NTFS. Update primary boot. */ struct buffer_head *bh0 = sb_getblk(sb, 0); if (bh0) { if (buffer_locked(bh0)) __wait_on_buffer(bh0); lock_buffer(bh0); memcpy(bh0->b_data, boot2, sizeof(*boot2)); set_buffer_uptodate(bh0); mark_buffer_dirty(bh0); unlock_buffer(bh0); if (!sync_dirty_buffer(bh0)) ntfs_warn(sb, "primary boot is updated"); put_bh(bh0); } kfree(boot2); } #ifdef CONFIG_PROC_FS /* Create /proc/fs/ntfs3/.. */ if (proc_info_root) { struct proc_dir_entry *e = proc_mkdir(sb->s_id, proc_info_root); static_assert((S_IRUGO | S_IWUSR) == 0644); if (e) { proc_create_data("volinfo", S_IRUGO, e, &ntfs3_volinfo_fops, sb); proc_create_data("label", S_IRUGO | S_IWUSR, e, &ntfs3_label_fops, sb); sbi->procdir = e; } } #endif return 0; put_inode_out: iput(inode); out: ntfs3_put_sbi(sbi); kfree(boot2); ntfs3_put_sbi(sbi); return err; } void ntfs_unmap_meta(struct super_block *sb, CLST lcn, CLST len) { struct ntfs_sb_info *sbi = sb->s_fs_info; struct block_device *bdev = sb->s_bdev; sector_t devblock = (u64)lcn * sbi->blocks_per_cluster; unsigned long blocks = (u64)len * sbi->blocks_per_cluster; unsigned long cnt = 0; unsigned long limit = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - sb->s_blocksize_bits); if (limit >= 0x2000) limit -= 0x1000; else if (limit < 32) limit = 32; else limit >>= 1; while (blocks--) { clean_bdev_aliases(bdev, devblock++, 1); if (cnt++ >= limit) { sync_blockdev(bdev); cnt = 0; } } } /* * ntfs_discard - Issue a discard request (trim for SSD). */ int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len) { int err; u64 lbo, bytes, start, end; struct super_block *sb; if (sbi->used.next_free_lcn == lcn + len) sbi->used.next_free_lcn = lcn; if (sbi->flags & NTFS_FLAGS_NODISCARD) return -EOPNOTSUPP; if (!sbi->options->discard) return -EOPNOTSUPP; lbo = (u64)lcn << sbi->cluster_bits; bytes = (u64)len << sbi->cluster_bits; /* Align up 'start' on discard_granularity. */ start = (lbo + sbi->discard_granularity - 1) & sbi->discard_granularity_mask_inv; /* Align down 'end' on discard_granularity. */ end = (lbo + bytes) & sbi->discard_granularity_mask_inv; sb = sbi->sb; if (start >= end) return 0; err = blkdev_issue_discard(sb->s_bdev, start >> 9, (end - start) >> 9, GFP_NOFS); if (err == -EOPNOTSUPP) sbi->flags |= NTFS_FLAGS_NODISCARD; return err; } static int ntfs_fs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, ntfs_fill_super); } /* * ntfs_fs_free - Free fs_context. * * Note that this will be called after fill_super and reconfigure * even when they pass. So they have to take pointers if they pass. */ static void ntfs_fs_free(struct fs_context *fc) { struct ntfs_mount_options *opts = fc->fs_private; struct ntfs_sb_info *sbi = fc->s_fs_info; if (sbi) { ntfs3_put_sbi(sbi); ntfs3_free_sbi(sbi); } if (opts) put_mount_options(opts); } // clang-format off static const struct fs_context_operations ntfs_context_ops = { .parse_param = ntfs_fs_parse_param, .get_tree = ntfs_fs_get_tree, .reconfigure = ntfs_fs_reconfigure, .free = ntfs_fs_free, }; // clang-format on /* * ntfs_init_fs_context - Initialize sbi and opts * * This will called when mount/remount. We will first initialize * options so that if remount we can use just that. */ static int ntfs_init_fs_context(struct fs_context *fc) { struct ntfs_mount_options *opts; struct ntfs_sb_info *sbi; opts = kzalloc(sizeof(struct ntfs_mount_options), GFP_NOFS); if (!opts) return -ENOMEM; /* Default options. */ opts->fs_uid = current_uid(); opts->fs_gid = current_gid(); opts->fs_fmask_inv = ~current_umask(); opts->fs_dmask_inv = ~current_umask(); if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) goto ok; sbi = kzalloc(sizeof(struct ntfs_sb_info), GFP_NOFS); if (!sbi) goto free_opts; sbi->upcase = kvmalloc(0x10000 * sizeof(short), GFP_KERNEL); if (!sbi->upcase) goto free_sbi; ratelimit_state_init(&sbi->msg_ratelimit, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); mutex_init(&sbi->compress.mtx_lznt); #ifdef CONFIG_NTFS3_LZX_XPRESS mutex_init(&sbi->compress.mtx_xpress); mutex_init(&sbi->compress.mtx_lzx); #endif fc->s_fs_info = sbi; ok: fc->fs_private = opts; fc->ops = &ntfs_context_ops; return 0; free_sbi: kfree(sbi); free_opts: kfree(opts); return -ENOMEM; } static void ntfs3_kill_sb(struct super_block *sb) { struct ntfs_sb_info *sbi = sb->s_fs_info; kill_block_super(sb); if (sbi->options) put_mount_options(sbi->options); ntfs3_free_sbi(sbi); } // clang-format off static struct file_system_type ntfs_fs_type = { .owner = THIS_MODULE, .name = "ntfs3", .init_fs_context = ntfs_init_fs_context, .parameters = ntfs_fs_parameters, .kill_sb = ntfs3_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; // clang-format on static int __init init_ntfs_fs(void) { int err; pr_info("ntfs3: Max link count %u\n", NTFS_LINK_MAX); if (IS_ENABLED(CONFIG_NTFS3_FS_POSIX_ACL)) pr_info("ntfs3: Enabled Linux POSIX ACLs support\n"); if (IS_ENABLED(CONFIG_NTFS3_64BIT_CLUSTER)) pr_notice( "ntfs3: Warning: Activated 64 bits per cluster. Windows does not support this\n"); if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS)) pr_info("ntfs3: Read-only LZX/Xpress compression included\n"); #ifdef CONFIG_PROC_FS /* Create "/proc/fs/ntfs3" */ proc_info_root = proc_mkdir("fs/ntfs3", NULL); #endif err = ntfs3_init_bitmap(); if (err) return err; ntfs_inode_cachep = kmem_cache_create( "ntfs_inode_cache", sizeof(struct ntfs_inode), 0, (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT), init_once); if (!ntfs_inode_cachep) { err = -ENOMEM; goto out1; } err = register_filesystem(&ntfs_fs_type); if (err) goto out; return 0; out: kmem_cache_destroy(ntfs_inode_cachep); out1: ntfs3_exit_bitmap(); return err; } static void __exit exit_ntfs_fs(void) { rcu_barrier(); kmem_cache_destroy(ntfs_inode_cachep); unregister_filesystem(&ntfs_fs_type); ntfs3_exit_bitmap(); #ifdef CONFIG_PROC_FS if (proc_info_root) remove_proc_entry("fs/ntfs3", NULL); #endif } MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ntfs3 read/write filesystem"); #ifdef CONFIG_NTFS3_FS_POSIX_ACL MODULE_INFO(behaviour, "Enabled Linux POSIX ACLs support"); #endif #ifdef CONFIG_NTFS3_64BIT_CLUSTER MODULE_INFO( cluster, "Warning: Activated 64 bits per cluster. Windows does not support this"); #endif #ifdef CONFIG_NTFS3_LZX_XPRESS MODULE_INFO(compression, "Read-only lzx/xpress compression included"); #endif MODULE_AUTHOR("Konstantin Komarov"); MODULE_ALIAS_FS("ntfs3"); module_init(init_ntfs_fs); module_exit(exit_ntfs_fs);
12 9 1 10 1 12 10 12 1 8 8 8 8 2 8 8 7 3 8 8 2 3 7 2 2 2 1 2 1 9 1 9 9 8 9 6 2 2 1 9 9 9 9 9 9 2 11 8 8 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 /* * 8259 interrupt controller emulation * * Copyright (c) 2003-2004 Fabrice Bellard * Copyright (c) 2007 Intel Corporation * Copyright 2009 Red Hat, Inc. and/or its affiliates. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. * Authors: * Yaozu (Eddie) Dong <Eddie.dong@intel.com> * Port from Qemu. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/slab.h> #include <linux/bitops.h> #include "irq.h" #include <linux/kvm_host.h> #include "trace.h" #define pr_pic_unimpl(fmt, ...) \ pr_err_ratelimited("pic: " fmt, ## __VA_ARGS__) static void pic_irq_request(struct kvm *kvm, int level); static void pic_lock(struct kvm_pic *s) __acquires(&s->lock) { spin_lock(&s->lock); } static void pic_unlock(struct kvm_pic *s) __releases(&s->lock) { bool wakeup = s->wakeup_needed; struct kvm_vcpu *vcpu; unsigned long i; s->wakeup_needed = false; spin_unlock(&s->lock); if (wakeup) { kvm_for_each_vcpu(i, vcpu, s->kvm) { if (kvm_apic_accept_pic_intr(vcpu)) { kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); return; } } } } static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s->isr &= ~(1 << irq); if (s != &s->pics_state->pics[0]) irq += 8; /* * We are dropping lock while calling ack notifiers since ack * notifier callbacks for assigned devices call into PIC recursively. * Other interrupt may be delivered to PIC while lock is dropped but * it should be safe since PIC state is already updated at this stage. */ pic_unlock(s->pics_state); kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); pic_lock(s->pics_state); } /* * set irq level. If an edge is detected, then the IRR is set to 1 */ static inline int pic_set_irq1(struct kvm_kpic_state *s, int irq, int level) { int mask, ret = 1; mask = 1 << irq; if (s->elcr & mask) /* level triggered */ if (level) { ret = !(s->irr & mask); s->irr |= mask; s->last_irr |= mask; } else { s->irr &= ~mask; s->last_irr &= ~mask; } else /* edge triggered */ if (level) { if ((s->last_irr & mask) == 0) { ret = !(s->irr & mask); s->irr |= mask; } s->last_irr |= mask; } else s->last_irr &= ~mask; return (s->imr & mask) ? -1 : ret; } /* * return the highest priority found in mask (highest = smallest * number). Return 8 if no irq */ static inline int get_priority(struct kvm_kpic_state *s, int mask) { int priority; if (mask == 0) return 8; priority = 0; while ((mask & (1 << ((priority + s->priority_add) & 7))) == 0) priority++; return priority; } /* * return the pic wanted interrupt. return -1 if none */ static int pic_get_irq(struct kvm_kpic_state *s) { int mask, cur_priority, priority; mask = s->irr & ~s->imr; priority = get_priority(s, mask); if (priority == 8) return -1; /* * compute current priority. If special fully nested mode on the * master, the IRQ coming from the slave is not taken into account * for the priority computation. */ mask = s->isr; if (s->special_fully_nested_mode && s == &s->pics_state->pics[0]) mask &= ~(1 << 2); cur_priority = get_priority(s, mask); if (priority < cur_priority) /* * higher priority found: an irq should be generated */ return (priority + s->priority_add) & 7; else return -1; } /* * raise irq to CPU if necessary. must be called every time the active * irq may change */ static void pic_update_irq(struct kvm_pic *s) { int irq2, irq; irq2 = pic_get_irq(&s->pics[1]); if (irq2 >= 0) { /* * if irq request by slave pic, signal master PIC */ pic_set_irq1(&s->pics[0], 2, 1); pic_set_irq1(&s->pics[0], 2, 0); } irq = pic_get_irq(&s->pics[0]); pic_irq_request(s->kvm, irq >= 0); } void kvm_pic_update_irq(struct kvm_pic *s) { pic_lock(s); pic_update_irq(s); pic_unlock(s); } int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) { int ret, irq_level; BUG_ON(irq < 0 || irq >= PIC_NUM_PINS); pic_lock(s); irq_level = __kvm_irq_line_state(&s->irq_states[irq], irq_source_id, level); ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); pic_update_irq(s); trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, s->pics[irq >> 3].imr, ret == 0); pic_unlock(s); return ret; } void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id) { int i; pic_lock(s); for (i = 0; i < PIC_NUM_PINS; i++) __clear_bit(irq_source_id, &s->irq_states[i]); pic_unlock(s); } /* * acknowledge interrupt 'irq' */ static inline void pic_intack(struct kvm_kpic_state *s, int irq) { s->isr |= 1 << irq; /* * We don't clear a level sensitive interrupt here */ if (!(s->elcr & (1 << irq))) s->irr &= ~(1 << irq); if (s->auto_eoi) { if (s->rotate_on_auto_eoi) s->priority_add = (irq + 1) & 7; pic_clear_isr(s, irq); } } int kvm_pic_read_irq(struct kvm *kvm) { int irq, irq2, intno; struct kvm_pic *s = kvm->arch.vpic; s->output = 0; pic_lock(s); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { pic_intack(&s->pics[0], irq); if (irq == 2) { irq2 = pic_get_irq(&s->pics[1]); if (irq2 >= 0) pic_intack(&s->pics[1], irq2); else /* * spurious IRQ on slave controller */ irq2 = 7; intno = s->pics[1].irq_base + irq2; } else intno = s->pics[0].irq_base + irq; } else { /* * spurious IRQ on host controller */ irq = 7; intno = s->pics[0].irq_base + irq; } pic_update_irq(s); pic_unlock(s); return intno; } static void kvm_pic_reset(struct kvm_kpic_state *s) { int irq; unsigned long i; struct kvm_vcpu *vcpu; u8 edge_irr = s->irr & ~s->elcr; bool found = false; s->last_irr = 0; s->irr &= s->elcr; s->imr = 0; s->priority_add = 0; s->special_mask = 0; s->read_reg_select = 0; if (!s->init4) { s->special_fully_nested_mode = 0; s->auto_eoi = 0; } s->init_state = 1; kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm) if (kvm_apic_accept_pic_intr(vcpu)) { found = true; break; } if (!found) return; for (irq = 0; irq < PIC_NUM_PINS/2; irq++) if (edge_irr & (1 << irq)) pic_clear_isr(s, irq); } static void pic_ioport_write(void *opaque, u32 addr, u32 val) { struct kvm_kpic_state *s = opaque; int priority, cmd, irq; addr &= 1; if (addr == 0) { if (val & 0x10) { s->init4 = val & 1; if (val & 0x02) pr_pic_unimpl("single mode not supported"); if (val & 0x08) pr_pic_unimpl( "level sensitive irq not supported"); kvm_pic_reset(s); } else if (val & 0x08) { if (val & 0x04) s->poll = 1; if (val & 0x02) s->read_reg_select = val & 1; if (val & 0x40) s->special_mask = (val >> 5) & 1; } else { cmd = val >> 5; switch (cmd) { case 0: case 4: s->rotate_on_auto_eoi = cmd >> 2; break; case 1: /* end of interrupt */ case 5: priority = get_priority(s, s->isr); if (priority != 8) { irq = (priority + s->priority_add) & 7; if (cmd == 5) s->priority_add = (irq + 1) & 7; pic_clear_isr(s, irq); pic_update_irq(s->pics_state); } break; case 3: irq = val & 7; pic_clear_isr(s, irq); pic_update_irq(s->pics_state); break; case 6: s->priority_add = (val + 1) & 7; pic_update_irq(s->pics_state); break; case 7: irq = val & 7; s->priority_add = (irq + 1) & 7; pic_clear_isr(s, irq); pic_update_irq(s->pics_state); break; default: break; /* no operation */ } } } else switch (s->init_state) { case 0: { /* normal mode */ u8 imr_diff = s->imr ^ val, off = (s == &s->pics_state->pics[0]) ? 0 : 8; s->imr = val; for (irq = 0; irq < PIC_NUM_PINS/2; irq++) if (imr_diff & (1 << irq)) kvm_fire_mask_notifiers( s->pics_state->kvm, SELECT_PIC(irq + off), irq + off, !!(s->imr & (1 << irq))); pic_update_irq(s->pics_state); break; } case 1: s->irq_base = val & 0xf8; s->init_state = 2; break; case 2: if (s->init4) s->init_state = 3; else s->init_state = 0; break; case 3: s->special_fully_nested_mode = (val >> 4) & 1; s->auto_eoi = (val >> 1) & 1; s->init_state = 0; break; } } static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) { int ret; ret = pic_get_irq(s); if (ret >= 0) { if (addr1 >> 7) { s->pics_state->pics[0].isr &= ~(1 << 2); s->pics_state->pics[0].irr &= ~(1 << 2); } s->irr &= ~(1 << ret); pic_clear_isr(s, ret); if (addr1 >> 7 || ret != 2) pic_update_irq(s->pics_state); /* Bit 7 is 1, means there's an interrupt */ ret |= 0x80; } else { /* Bit 7 is 0, means there's no interrupt */ ret = 0x07; pic_update_irq(s->pics_state); } return ret; } static u32 pic_ioport_read(void *opaque, u32 addr) { struct kvm_kpic_state *s = opaque; int ret; if (s->poll) { ret = pic_poll_read(s, addr); s->poll = 0; } else if ((addr & 1) == 0) if (s->read_reg_select) ret = s->isr; else ret = s->irr; else ret = s->imr; return ret; } static void elcr_ioport_write(void *opaque, u32 val) { struct kvm_kpic_state *s = opaque; s->elcr = val & s->elcr_mask; } static u32 elcr_ioport_read(void *opaque) { struct kvm_kpic_state *s = opaque; return s->elcr; } static int picdev_write(struct kvm_pic *s, gpa_t addr, int len, const void *val) { unsigned char data = *(unsigned char *)val; if (len != 1) { pr_pic_unimpl("non byte write\n"); return 0; } switch (addr) { case 0x20: case 0x21: pic_lock(s); pic_ioport_write(&s->pics[0], addr, data); pic_unlock(s); break; case 0xa0: case 0xa1: pic_lock(s); pic_ioport_write(&s->pics[1], addr, data); pic_unlock(s); break; case 0x4d0: case 0x4d1: pic_lock(s); elcr_ioport_write(&s->pics[addr & 1], data); pic_unlock(s); break; default: return -EOPNOTSUPP; } return 0; } static int picdev_read(struct kvm_pic *s, gpa_t addr, int len, void *val) { unsigned char *data = (unsigned char *)val; if (len != 1) { memset(val, 0, len); pr_pic_unimpl("non byte read\n"); return 0; } switch (addr) { case 0x20: case 0x21: case 0xa0: case 0xa1: pic_lock(s); *data = pic_ioport_read(&s->pics[addr >> 7], addr); pic_unlock(s); break; case 0x4d0: case 0x4d1: pic_lock(s); *data = elcr_ioport_read(&s->pics[addr & 1]); pic_unlock(s); break; default: return -EOPNOTSUPP; } return 0; } static int picdev_master_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, const void *val) { return picdev_write(container_of(dev, struct kvm_pic, dev_master), addr, len, val); } static int picdev_master_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, void *val) { return picdev_read(container_of(dev, struct kvm_pic, dev_master), addr, len, val); } static int picdev_slave_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, const void *val) { return picdev_write(container_of(dev, struct kvm_pic, dev_slave), addr, len, val); } static int picdev_slave_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, void *val) { return picdev_read(container_of(dev, struct kvm_pic, dev_slave), addr, len, val); } static int picdev_elcr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, const void *val) { return picdev_write(container_of(dev, struct kvm_pic, dev_elcr), addr, len, val); } static int picdev_elcr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, void *val) { return picdev_read(container_of(dev, struct kvm_pic, dev_elcr), addr, len, val); } /* * callback when PIC0 irq status changed */ static void pic_irq_request(struct kvm *kvm, int level) { struct kvm_pic *s = kvm->arch.vpic; if (!s->output) s->wakeup_needed = true; s->output = level; } static const struct kvm_io_device_ops picdev_master_ops = { .read = picdev_master_read, .write = picdev_master_write, }; static const struct kvm_io_device_ops picdev_slave_ops = { .read = picdev_slave_read, .write = picdev_slave_write, }; static const struct kvm_io_device_ops picdev_elcr_ops = { .read = picdev_elcr_read, .write = picdev_elcr_write, }; int kvm_pic_init(struct kvm *kvm) { struct kvm_pic *s; int ret; s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL_ACCOUNT); if (!s) return -ENOMEM; spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; s->pics[0].pics_state = s; s->pics[1].pics_state = s; /* * Initialize PIO device */ kvm_iodevice_init(&s->dev_master, &picdev_master_ops); kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops); kvm_iodevice_init(&s->dev_elcr, &picdev_elcr_ops); mutex_lock(&kvm->slots_lock); ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2, &s->dev_master); if (ret < 0) goto fail_unlock; ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0xa0, 2, &s->dev_slave); if (ret < 0) goto fail_unreg_2; ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_elcr); if (ret < 0) goto fail_unreg_1; mutex_unlock(&kvm->slots_lock); kvm->arch.vpic = s; return 0; fail_unreg_1: kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave); fail_unreg_2: kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_master); fail_unlock: mutex_unlock(&kvm->slots_lock); kfree(s); return ret; } void kvm_pic_destroy(struct kvm *kvm) { struct kvm_pic *vpic = kvm->arch.vpic; if (!vpic) return; mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_elcr); mutex_unlock(&kvm->slots_lock); kvm->arch.vpic = NULL; kfree(vpic); }
382 28844 32 32 5 5 60 3 61 54 61 373 62 306 101 101 101 234 235 235 29481 44681 29484 34499 83 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1994 Linus Torvalds * * Pentium III FXSR, SSE support * General FPU state handling cleanups * Gareth Hughes <gareth@valinux.com>, May 2000 */ #include <asm/fpu/api.h> #include <asm/fpu/regset.h> #include <asm/fpu/sched.h> #include <asm/fpu/signal.h> #include <asm/fpu/types.h> #include <asm/traps.h> #include <asm/irq_regs.h> #include <uapi/asm/kvm.h> #include <linux/hardirq.h> #include <linux/pkeys.h> #include <linux/vmalloc.h> #include "context.h" #include "internal.h" #include "legacy.h" #include "xstate.h" #define CREATE_TRACE_POINTS #include <asm/trace/fpu.h> #ifdef CONFIG_X86_64 DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); DEFINE_PER_CPU(u64, xfd_state); #endif /* The FPU state configuration data for kernel and user space */ struct fpu_state_config fpu_kernel_cfg __ro_after_init; struct fpu_state_config fpu_user_cfg __ro_after_init; /* * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: */ struct fpstate init_fpstate __ro_after_init; /* Track in-kernel FPU usage */ static DEFINE_PER_CPU(bool, in_kernel_fpu); /* * Track which context is using the FPU on the CPU: */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); /* * Can we use the FPU in kernel mode with the * whole "kernel_fpu_begin/end()" sequence? */ bool irq_fpu_usable(void) { if (WARN_ON_ONCE(in_nmi())) return false; /* In kernel FPU usage already active? */ if (this_cpu_read(in_kernel_fpu)) return false; /* * When not in NMI or hard interrupt context, FPU can be used in: * * - Task context except from within fpregs_lock()'ed critical * regions. * * - Soft interrupt processing context which cannot happen * while in a fpregs_lock()'ed critical region. */ if (!in_hardirq()) return true; /* * In hard interrupt context it's safe when soft interrupts * are enabled, which means the interrupt did not hit in * a fpregs_lock()'ed critical region. */ return !softirq_count(); } EXPORT_SYMBOL(irq_fpu_usable); /* * Track AVX512 state use because it is known to slow the max clock * speed of the core. */ static void update_avx_timestamp(struct fpu *fpu) { #define AVX512_TRACKING_MASK (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM) if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK) fpu->avx512_timestamp = jiffies; } /* * Save the FPU register state in fpu->fpstate->regs. The register state is * preserved. * * Must be called with fpregs_lock() held. * * The legacy FNSAVE instruction clears all FPU state unconditionally, so * register state has to be reloaded. That might be a pointless exercise * when the FPU is going to be used by another task right after that. But * this only affects 20+ years old 32bit systems and avoids conditionals all * over the place. * * FXSAVE and all XSAVE variants preserve the FPU register state. */ void save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { os_xsave(fpu->fpstate); update_avx_timestamp(fpu); return; } if (likely(use_fxsr())) { fxsave(&fpu->fpstate->regs.fxsave); return; } /* * Legacy FPU register saving, FNSAVE always clears FPU registers, * so we have to reload them from the memory state. */ asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave)); frstor(&fpu->fpstate->regs.fsave); } void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) { /* * AMD K7/K8 and later CPUs up to Zen don't save/restore * FDP/FIP/FOP unless an exception is pending. Clear the x87 state * here by setting it to fixed values. "m" is a random variable * that should be in L1. */ if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { asm volatile( "fnclex\n\t" "emms\n\t" "fildl %P[addr]" /* set F?P to defined value */ : : [addr] "m" (fpstate)); } if (use_xsave()) { /* * Dynamically enabled features are enabled in XCR0, but * usage requires also that the corresponding bits in XFD * are cleared. If the bits are set then using a related * instruction will raise #NM. This allows to do the * allocation of the larger FPU buffer lazy from #NM or if * the task has no permission to kill it which would happen * via #UD if the feature is disabled in XCR0. * * XFD state is following the same life time rules as * XSTATE and to restore state correctly XFD has to be * updated before XRSTORS otherwise the component would * stay in or go into init state even if the bits are set * in fpstate::regs::xsave::xfeatures. */ xfd_update_state(fpstate); /* * Restoring state always needs to modify all features * which are in @mask even if the current task cannot use * extended features. * * So fpstate->xfeatures cannot be used here, because then * a feature for which the task has no permission but was * used by the previous task would not go into init state. */ mask = fpu_kernel_cfg.max_features & mask; os_xrstor(fpstate, mask); } else { if (use_fxsr()) fxrstor(&fpstate->regs.fxsave); else frstor(&fpstate->regs.fsave); } } void fpu_reset_from_exception_fixup(void) { restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE); } #if IS_ENABLED(CONFIG_KVM) static void __fpstate_reset(struct fpstate *fpstate, u64 xfd); static void fpu_init_guest_permissions(struct fpu_guest *gfpu) { struct fpu_state_perm *fpuperm; u64 perm; if (!IS_ENABLED(CONFIG_X86_64)) return; spin_lock_irq(&current->sighand->siglock); fpuperm = &current->group_leader->thread.fpu.guest_perm; perm = fpuperm->__state_perm; /* First fpstate allocation locks down permissions. */ WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED); spin_unlock_irq(&current->sighand->siglock); gfpu->perm = perm & ~FPU_GUEST_PERM_LOCKED; } bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) { struct fpstate *fpstate; unsigned int size; size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64); fpstate = vzalloc(size); if (!fpstate) return false; /* Leave xfd to 0 (the reset value defined by spec) */ __fpstate_reset(fpstate, 0); fpstate_init_user(fpstate); fpstate->is_valloc = true; fpstate->is_guest = true; gfpu->fpstate = fpstate; gfpu->xfeatures = fpu_user_cfg.default_features; gfpu->perm = fpu_user_cfg.default_features; /* * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state * to userspace, even when XSAVE is unsupported, so that restoring FPU * state on a different CPU that does support XSAVE can cleanly load * the incoming state using its natural XSAVE. In other words, KVM's * uABI size may be larger than this host's default size. Conversely, * the default size should never be larger than KVM's base uABI size; * all features that can expand the uABI size must be opt-in. */ gfpu->uabi_size = sizeof(struct kvm_xsave); if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size)) gfpu->uabi_size = fpu_user_cfg.default_size; fpu_init_guest_permissions(gfpu); return true; } EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate); void fpu_free_guest_fpstate(struct fpu_guest *gfpu) { struct fpstate *fps = gfpu->fpstate; if (!fps) return; if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use)) return; gfpu->fpstate = NULL; vfree(fps); } EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); /* * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable * @guest_fpu: Pointer to the guest FPU container * @xfeatures: Features requested by guest CPUID * * Enable all dynamic xfeatures according to guest perm and requested CPUID. * * Return: 0 on success, error code otherwise */ int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures) { lockdep_assert_preemption_enabled(); /* Nothing to do if all requested features are already enabled. */ xfeatures &= ~guest_fpu->xfeatures; if (!xfeatures) return 0; return __xfd_enable_feature(xfeatures, guest_fpu); } EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features); #ifdef CONFIG_X86_64 void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { fpregs_lock(); guest_fpu->fpstate->xfd = xfd; if (guest_fpu->fpstate->in_use) xfd_update_state(guest_fpu->fpstate); fpregs_unlock(); } EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); /** * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state * * Must be invoked from KVM after a VMEXIT before enabling interrupts when * XFD write emulation is disabled. This is required because the guest can * freely modify XFD and the state at VMEXIT is not guaranteed to be the * same as the state on VMENTER. So software state has to be udpated before * any operation which depends on it can take place. * * Note: It can be invoked unconditionally even when write emulation is * enabled for the price of a then pointless MSR read. */ void fpu_sync_guest_vmexit_xfd_state(void) { struct fpstate *fps = current->thread.fpu.fpstate; lockdep_assert_irqs_disabled(); if (fpu_state_size_dynamic()) { rdmsrl(MSR_IA32_XFD, fps->xfd); __this_cpu_write(xfd_state, fps->xfd); } } EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); #endif /* CONFIG_X86_64 */ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) { struct fpstate *guest_fps = guest_fpu->fpstate; struct fpu *fpu = &current->thread.fpu; struct fpstate *cur_fps = fpu->fpstate; fpregs_lock(); if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD)) save_fpregs_to_fpstate(fpu); /* Swap fpstate */ if (enter_guest) { fpu->__task_fpstate = cur_fps; fpu->fpstate = guest_fps; guest_fps->in_use = true; } else { guest_fps->in_use = false; fpu->fpstate = fpu->__task_fpstate; fpu->__task_fpstate = NULL; } cur_fps = fpu->fpstate; if (!cur_fps->is_confidential) { /* Includes XFD update */ restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); } else { /* * XSTATE is restored by firmware from encrypted * memory. Make sure XFD state is correct while * running with guest fpstate */ xfd_update_state(cur_fps); } fpregs_mark_activate(); fpregs_unlock(); return 0; } EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u64 xfeatures, u32 pkru) { struct fpstate *kstate = gfpu->fpstate; union fpregs_state *ustate = buf; struct membuf mb = { .p = buf, .left = size }; if (cpu_feature_enabled(X86_FEATURE_XSAVE)) { __copy_xstate_to_uabi_buf(mb, kstate, xfeatures, pkru, XSTATE_COPY_XSAVE); } else { memcpy(&ustate->fxsave, &kstate->regs.fxsave, sizeof(ustate->fxsave)); /* Make it restorable on a XSAVE enabled host */ ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE; } } EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi); int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru) { struct fpstate *kstate = gfpu->fpstate; const union fpregs_state *ustate = buf; if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) { if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE) return -EINVAL; if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask) return -EINVAL; memcpy(&kstate->regs.fxsave, &ustate->fxsave, sizeof(ustate->fxsave)); return 0; } if (ustate->xsave.header.xfeatures & ~xcr0) return -EINVAL; /* * Nullify @vpkru to preserve its current value if PKRU's bit isn't set * in the header. KVM's odd ABI is to leave PKRU untouched in this * case (all other components are eventually re-initialized). */ if (!(ustate->xsave.header.xfeatures & XFEATURE_MASK_PKRU)) vpkru = NULL; return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru); } EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); #endif /* CONFIG_KVM */ void kernel_fpu_begin_mask(unsigned int kfpu_mask) { preempt_disable(); WARN_ON_FPU(!irq_fpu_usable()); WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, true); if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) && !test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); save_fpregs_to_fpstate(&current->thread.fpu); } __cpu_invalidate_fpregs_state(); /* Put sane initial values into the control registers. */ if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM)) ldmxcsr(MXCSR_DEFAULT); if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU)) asm volatile ("fninit"); } EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask); void kernel_fpu_end(void) { WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, false); preempt_enable(); } EXPORT_SYMBOL_GPL(kernel_fpu_end); /* * Sync the FPU register state to current's memory register state when the * current task owns the FPU. The hardware register state is preserved. */ void fpu_sync_fpstate(struct fpu *fpu) { WARN_ON_FPU(fpu != &current->thread.fpu); fpregs_lock(); trace_x86_fpu_before_save(fpu); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) save_fpregs_to_fpstate(fpu); trace_x86_fpu_after_save(fpu); fpregs_unlock(); } static inline unsigned int init_fpstate_copy_size(void) { if (!use_xsave()) return fpu_kernel_cfg.default_size; /* XSAVE(S) just needs the legacy and the xstate header part */ return sizeof(init_fpstate.regs.xsave); } static inline void fpstate_init_fxstate(struct fpstate *fpstate) { fpstate->regs.fxsave.cwd = 0x37f; fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT; } /* * Legacy x87 fpstate state init: */ static inline void fpstate_init_fstate(struct fpstate *fpstate) { fpstate->regs.fsave.cwd = 0xffff037fu; fpstate->regs.fsave.swd = 0xffff0000u; fpstate->regs.fsave.twd = 0xffffffffu; fpstate->regs.fsave.fos = 0xffff0000u; } /* * Used in two places: * 1) Early boot to setup init_fpstate for non XSAVE systems * 2) fpu_init_fpstate_user() which is invoked from KVM */ void fpstate_init_user(struct fpstate *fpstate) { if (!cpu_feature_enabled(X86_FEATURE_FPU)) { fpstate_init_soft(&fpstate->regs.soft); return; } xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures); if (cpu_feature_enabled(X86_FEATURE_FXSR)) fpstate_init_fxstate(fpstate); else fpstate_init_fstate(fpstate); } static void __fpstate_reset(struct fpstate *fpstate, u64 xfd) { /* Initialize sizes and feature masks */ fpstate->size = fpu_kernel_cfg.default_size; fpstate->user_size = fpu_user_cfg.default_size; fpstate->xfeatures = fpu_kernel_cfg.default_features; fpstate->user_xfeatures = fpu_user_cfg.default_features; fpstate->xfd = xfd; } void fpstate_reset(struct fpu *fpu) { /* Set the fpstate pointer to the default fpstate */ fpu->fpstate = &fpu->__fpstate; __fpstate_reset(fpu->fpstate, init_fpstate.xfd); /* Initialize the permission related info in fpu */ fpu->perm.__state_perm = fpu_kernel_cfg.default_features; fpu->perm.__state_size = fpu_kernel_cfg.default_size; fpu->perm.__user_state_size = fpu_user_cfg.default_size; /* Same defaults for guests */ fpu->guest_perm = fpu->perm; } static inline void fpu_inherit_perms(struct fpu *dst_fpu) { if (fpu_state_size_dynamic()) { struct fpu *src_fpu = &current->group_leader->thread.fpu; spin_lock_irq(&current->sighand->siglock); /* Fork also inherits the permissions of the parent */ dst_fpu->perm = src_fpu->perm; dst_fpu->guest_perm = src_fpu->guest_perm; spin_unlock_irq(&current->sighand->siglock); } } /* A passed ssp of zero will not cause any update */ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) { #ifdef CONFIG_X86_USER_SHADOW_STACK struct cet_user_state *xstate; /* If ssp update is not needed. */ if (!ssp) return 0; xstate = get_xsave_addr(&dst->thread.fpu.fpstate->regs.xsave, XFEATURE_CET_USER); /* * If there is a non-zero ssp, then 'dst' must be configured with a shadow * stack and the fpu state should be up to date since it was just copied * from the parent in fpu_clone(). So there must be a valid non-init CET * state location in the buffer. */ if (WARN_ON_ONCE(!xstate)) return 1; xstate->user_ssp = (u64)ssp; #endif return 0; } /* Clone current's FPU state on fork */ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, unsigned long ssp) { struct fpu *src_fpu = &current->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; fpstate_reset(dst_fpu); if (!cpu_feature_enabled(X86_FEATURE_FPU)) return 0; /* * Enforce reload for user space tasks and prevent kernel threads * from trying to save the FPU registers on context switch. */ set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); /* * No FPU state inheritance for kernel threads and IO * worker threads. */ if (minimal) { /* Clear out the minimal state */ memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); return 0; } /* * If a new feature is added, ensure all dynamic features are * caller-saved from here! */ BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA); /* * Save the default portion of the current FPU state into the * clone. Assume all dynamic features to be defined as caller- * saved, which enables skipping both the expansion of fpstate * and the copying of any dynamic state. * * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because * copying is not valid when current uses non-default states. */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); save_fpregs_to_fpstate(dst_fpu); fpregs_unlock(); if (!(clone_flags & CLONE_THREAD)) fpu_inherit_perms(dst_fpu); /* * Children never inherit PASID state. * Force it to have its init value: */ if (use_xsave()) dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID; /* * Update shadow stack pointer, in case it changed during clone. */ if (update_fpu_shstk(dst, ssp)) return 1; trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); return 0; } /* * Whitelist the FPU register state embedded into task_struct for hardened * usercopy. */ void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); *size = fpu_kernel_cfg.default_size; } /* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents * in the fpregs in the eager-FPU case. * * This function can be used in cases where we know that * a state-restore is coming: either an explicit one, * or a reschedule. */ void fpu__drop(struct fpu *fpu) { preempt_disable(); if (fpu == &current->thread.fpu) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" _ASM_EXTABLE(1b, 2b)); fpregs_deactivate(fpu); } trace_x86_fpu_dropped(fpu); preempt_enable(); } /* * Clear FPU registers by setting them up from the init fpstate. * Caller must do fpregs_[un]lock() around it. */ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) { if (use_xsave()) os_xrstor(&init_fpstate, features_mask); else if (use_fxsr()) fxrstor(&init_fpstate.regs.fxsave); else frstor(&init_fpstate.regs.fsave); pkru_write_default(); } /* * Reset current->fpu memory state to the init values. */ static void fpu_reset_fpregs(void) { struct fpu *fpu = &current->thread.fpu; fpregs_lock(); __fpu_invalidate_fpregs_state(fpu); /* * This does not change the actual hardware registers. It just * resets the memory image and sets TIF_NEED_FPU_LOAD so a * subsequent return to usermode will reload the registers from the * task's memory image. * * Do not use fpstate_init() here. Just copy init_fpstate which has * the correct content already except for PKRU. * * PKRU handling does not rely on the xstate when restoring for * user space as PKRU is eagerly written in switch_to() and * flush_thread(). */ memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); set_thread_flag(TIF_NEED_FPU_LOAD); fpregs_unlock(); } /* * Reset current's user FPU states to the init states. current's * supervisor states, if any, are not modified by this function. The * caller guarantees that the XSTATE header in memory is intact. */ void fpu__clear_user_states(struct fpu *fpu) { WARN_ON_FPU(fpu != &current->thread.fpu); fpregs_lock(); if (!cpu_feature_enabled(X86_FEATURE_FPU)) { fpu_reset_fpregs(); fpregs_unlock(); return; } /* * Ensure that current's supervisor states are loaded into their * corresponding registers. */ if (xfeatures_mask_supervisor() && !fpregs_state_valid(fpu, smp_processor_id())) os_xrstor_supervisor(fpu->fpstate); /* Reset user states in registers. */ restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE); /* * Now all FPU registers have their desired values. Inform the FPU * state machine that current's FPU registers are in the hardware * registers. The memory image does not need to be updated because * any operation relying on it has to save the registers first when * current's FPU is marked active. */ fpregs_mark_activate(); fpregs_unlock(); } void fpu_flush_thread(void) { fpstate_reset(&current->thread.fpu); fpu_reset_fpregs(); } /* * Load FPU context before returning to userspace. */ void switch_fpu_return(void) { if (!static_cpu_has(X86_FEATURE_FPU)) return; fpregs_restore_userregs(); } EXPORT_SYMBOL_GPL(switch_fpu_return); void fpregs_lock_and_load(void) { /* * fpregs_lock() only disables preemption (mostly). So modifying state * in an interrupt could screw up some in progress fpregs operation. * Warn about it. */ WARN_ON_ONCE(!irq_fpu_usable()); WARN_ON_ONCE(current->flags & PF_KTHREAD); fpregs_lock(); fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); } #ifdef CONFIG_X86_DEBUG_FPU /* * If current FPU state according to its tracking (loaded FPU context on this * CPU) is not valid then we must have TIF_NEED_FPU_LOAD set so the context is * loaded on return to userland. */ void fpregs_assert_state_consistent(void) { struct fpu *fpu = &current->thread.fpu; if (test_thread_flag(TIF_NEED_FPU_LOAD)) return; WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id())); } EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent); #endif void fpregs_mark_activate(void) { struct fpu *fpu = &current->thread.fpu; fpregs_activate(fpu); fpu->last_cpu = smp_processor_id(); clear_thread_flag(TIF_NEED_FPU_LOAD); } /* * x87 math exception handling: */ int fpu__exception_code(struct fpu *fpu, int trap_nr) { int err; if (trap_nr == X86_TRAP_MF) { unsigned short cwd, swd; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked * status. 0x3f is the exception bits in these regs, 0x200 is the * C1 reg you need in case of a stack fault, 0x040 is the stack * fault bit. We should only be taking one exception at a time, * so if this combination doesn't produce any single exception, * then we have a bad program that isn't synchronizing its FPU usage * and it will suffer the consequences since we won't be able to * fully reproduce the context of the exception. */ if (boot_cpu_has(X86_FEATURE_FXSR)) { cwd = fpu->fpstate->regs.fxsave.cwd; swd = fpu->fpstate->regs.fxsave.swd; } else { cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd; swd = (unsigned short)fpu->fpstate->regs.fsave.swd; } err = swd & ~cwd; } else { /* * The SIMD FPU exceptions are handled a little differently, as there * is only a single status/control register. Thus, to determine which * unmasked exception was caught we must mask the exception mask bits * at 0x1f80, and then use these to mask the exception bits at 0x3f. */ unsigned short mxcsr = MXCSR_DEFAULT; if (boot_cpu_has(X86_FEATURE_XMM)) mxcsr = fpu->fpstate->regs.fxsave.mxcsr; err = ~(mxcsr >> 7) & mxcsr; } if (err & 0x001) { /* Invalid op */ /* * swd & 0x240 == 0x040: Stack Underflow * swd & 0x240 == 0x240: Stack Overflow * User must clear the SF bit (0x40) if set */ return FPE_FLTINV; } else if (err & 0x004) { /* Divide by Zero */ return FPE_FLTDIV; } else if (err & 0x008) { /* Overflow */ return FPE_FLTOVF; } else if (err & 0x012) { /* Denormal, Underflow */ return FPE_FLTUND; } else if (err & 0x020) { /* Precision */ return FPE_FLTRES; } /* * If we're using IRQ 13, or supposedly even some trap * X86_TRAP_MF implementations, it's possible * we get a spurious trap, which is not an error. */ return 0; } /* * Initialize register state that may prevent from entering low-power idle. * This function will be invoked from the cpuidle driver only when needed. */ noinstr void fpu_idle_fpregs(void) { /* Note: AMX_TILE being enabled implies XGETBV1 support */ if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) && (xfeatures_in_use() & XFEATURE_MASK_XTILE)) { tile_release(); __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } }
3 3 4 4 3 3 3 3 3 3 2 2 1 17 1 5 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2018 HUAWEI, Inc. * https://www.huawei.com/ */ #include "internal.h" struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp) { struct page *page = *pagepool; if (page) { DBG_BUGON(page_ref_count(page) != 1); *pagepool = (struct page *)page_private(page); } else { page = alloc_page(gfp); } return page; } void erofs_release_pages(struct page **pagepool) { while (*pagepool) { struct page *page = *pagepool; *pagepool = (struct page *)page_private(page); put_page(page); } } #ifdef CONFIG_EROFS_FS_ZIP /* global shrink count (for all mounted EROFS instances) */ static atomic_long_t erofs_global_shrink_cnt; static bool erofs_workgroup_get(struct erofs_workgroup *grp) { if (lockref_get_not_zero(&grp->lockref)) return true; spin_lock(&grp->lockref.lock); if (__lockref_is_dead(&grp->lockref)) { spin_unlock(&grp->lockref.lock); return false; } if (!grp->lockref.count++) atomic_long_dec(&erofs_global_shrink_cnt); spin_unlock(&grp->lockref.lock); return true; } struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, pgoff_t index) { struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_workgroup *grp; repeat: rcu_read_lock(); grp = xa_load(&sbi->managed_pslots, index); if (grp) { if (!erofs_workgroup_get(grp)) { /* prefer to relax rcu read side */ rcu_read_unlock(); goto repeat; } DBG_BUGON(index != grp->index); } rcu_read_unlock(); return grp; } struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, struct erofs_workgroup *grp) { struct erofs_sb_info *const sbi = EROFS_SB(sb); struct erofs_workgroup *pre; DBG_BUGON(grp->lockref.count < 1); repeat: xa_lock(&sbi->managed_pslots); pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index, NULL, grp, GFP_NOFS); if (pre) { if (xa_is_err(pre)) { pre = ERR_PTR(xa_err(pre)); } else if (!erofs_workgroup_get(pre)) { /* try to legitimize the current in-tree one */ xa_unlock(&sbi->managed_pslots); cond_resched(); goto repeat; } grp = pre; } xa_unlock(&sbi->managed_pslots); return grp; } static void __erofs_workgroup_free(struct erofs_workgroup *grp) { atomic_long_dec(&erofs_global_shrink_cnt); erofs_workgroup_free_rcu(grp); } void erofs_workgroup_put(struct erofs_workgroup *grp) { if (lockref_put_or_lock(&grp->lockref)) return; DBG_BUGON(__lockref_is_dead(&grp->lockref)); if (grp->lockref.count == 1) atomic_long_inc(&erofs_global_shrink_cnt); --grp->lockref.count; spin_unlock(&grp->lockref.lock); } static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, struct erofs_workgroup *grp) { int free = false; spin_lock(&grp->lockref.lock); if (grp->lockref.count) goto out; /* * Note that all cached pages should be detached before deleted from * the XArray. Otherwise some cached pages could be still attached to * the orphan old workgroup when the new one is available in the tree. */ if (erofs_try_to_free_all_cached_pages(sbi, grp)) goto out; /* * It's impossible to fail after the workgroup is freezed, * however in order to avoid some race conditions, add a * DBG_BUGON to observe this in advance. */ DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp); lockref_mark_dead(&grp->lockref); free = true; out: spin_unlock(&grp->lockref.lock); if (free) __erofs_workgroup_free(grp); return free; } static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, unsigned long nr_shrink) { struct erofs_workgroup *grp; unsigned int freed = 0; unsigned long index; xa_lock(&sbi->managed_pslots); xa_for_each(&sbi->managed_pslots, index, grp) { /* try to shrink each valid workgroup */ if (!erofs_try_to_release_workgroup(sbi, grp)) continue; xa_unlock(&sbi->managed_pslots); ++freed; if (!--nr_shrink) return freed; xa_lock(&sbi->managed_pslots); } xa_unlock(&sbi->managed_pslots); return freed; } /* protected by 'erofs_sb_list_lock' */ static unsigned int shrinker_run_no; /* protects the mounted 'erofs_sb_list' */ static DEFINE_SPINLOCK(erofs_sb_list_lock); static LIST_HEAD(erofs_sb_list); void erofs_shrinker_register(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); mutex_init(&sbi->umount_mutex); spin_lock(&erofs_sb_list_lock); list_add(&sbi->list, &erofs_sb_list); spin_unlock(&erofs_sb_list_lock); } void erofs_shrinker_unregister(struct super_block *sb) { struct erofs_sb_info *const sbi = EROFS_SB(sb); mutex_lock(&sbi->umount_mutex); /* clean up all remaining workgroups in memory */ erofs_shrink_workstation(sbi, ~0UL); spin_lock(&erofs_sb_list_lock); list_del(&sbi->list); spin_unlock(&erofs_sb_list_lock); mutex_unlock(&sbi->umount_mutex); } static unsigned long erofs_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { return atomic_long_read(&erofs_global_shrink_cnt); } static unsigned long erofs_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct erofs_sb_info *sbi; struct list_head *p; unsigned long nr = sc->nr_to_scan; unsigned int run_no; unsigned long freed = 0; spin_lock(&erofs_sb_list_lock); do { run_no = ++shrinker_run_no; } while (run_no == 0); /* Iterate over all mounted superblocks and try to shrink them */ p = erofs_sb_list.next; while (p != &erofs_sb_list) { sbi = list_entry(p, struct erofs_sb_info, list); /* * We move the ones we do to the end of the list, so we stop * when we see one we have already done. */ if (sbi->shrinker_run_no == run_no) break; if (!mutex_trylock(&sbi->umount_mutex)) { p = p->next; continue; } spin_unlock(&erofs_sb_list_lock); sbi->shrinker_run_no = run_no; freed += erofs_shrink_workstation(sbi, nr - freed); spin_lock(&erofs_sb_list_lock); /* Get the next list element before we move this one */ p = p->next; /* * Move this one to the end of the list to provide some * fairness. */ list_move_tail(&sbi->list, &erofs_sb_list); mutex_unlock(&sbi->umount_mutex); if (freed >= nr) break; } spin_unlock(&erofs_sb_list_lock); return freed; } static struct shrinker *erofs_shrinker_info; int __init erofs_init_shrinker(void) { erofs_shrinker_info = shrinker_alloc(0, "erofs-shrinker"); if (!erofs_shrinker_info) return -ENOMEM; erofs_shrinker_info->count_objects = erofs_shrink_count; erofs_shrinker_info->scan_objects = erofs_shrink_scan; shrinker_register(erofs_shrinker_info); return 0; } void erofs_exit_shrinker(void) { shrinker_free(erofs_shrinker_info); } #endif /* !CONFIG_EROFS_FS_ZIP */
1 2 2 2 2 5 4 5 7 3 3 3 3 3 3 3 2 5 2 4 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 /* SPDX-License-Identifier: GPL-2.0 */ /* * fs/f2fs/segment.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ #include <linux/blkdev.h> #include <linux/backing-dev.h> /* constant macro */ #define NULL_SEGNO ((unsigned int)(~0)) #define NULL_SECNO ((unsigned int)(~0)) #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ #define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ #define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ #define F2FS_MIN_META_SEGMENTS 8 /* SB + 2 (CP + SIT + NAT) + SSA */ /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) #define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) #define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) #define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE) #define SE_PAGETYPE(se) ((IS_NODESEG((se)->type) ? NODE : DATA)) static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, unsigned short seg_type) { f2fs_bug_on(sbi, seg_type >= NR_PERSISTENT_LOG); } #define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA) #define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) #define IS_COLD(t) ((t) == CURSEG_COLD_NODE || (t) == CURSEG_COLD_DATA) #define IS_CURSEG(sbi, seg) \ (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno)) #define IS_CURSEC(sbi, secno) \ (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ (sbi)->segs_per_sec) || \ ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ (sbi)->segs_per_sec) || \ ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ (sbi)->segs_per_sec) || \ ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ (sbi)->segs_per_sec) || \ ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ (sbi)->segs_per_sec) || \ ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ (sbi)->segs_per_sec) || \ ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno / \ (sbi)->segs_per_sec) || \ ((secno) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno / \ (sbi)->segs_per_sec)) #define MAIN_BLKADDR(sbi) \ (SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \ le32_to_cpu(F2FS_RAW_SUPER(sbi)->main_blkaddr)) #define SEG0_BLKADDR(sbi) \ (SM_I(sbi) ? SM_I(sbi)->seg0_blkaddr : \ le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment0_blkaddr)) #define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) #define MAIN_SECS(sbi) ((sbi)->total_sections) #define TOTAL_SEGS(sbi) \ (SM_I(sbi) ? SM_I(sbi)->segment_count : \ le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count)) #define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) #define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \ (sbi)->log_blocks_per_seg)) #define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \ (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg)) #define NEXT_FREE_BLKADDR(sbi, curseg) \ (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff) #define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi)) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg) #define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) #define GET_SEGNO(sbi, blk_addr) \ ((!__is_valid_data_blkaddr(blk_addr)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) #define BLKS_PER_SEC(sbi) \ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) #define CAP_BLKS_PER_SEC(sbi) \ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg - \ (sbi)->unusable_blocks_per_sec) #define CAP_SEGS_PER_SEC(sbi) \ ((sbi)->segs_per_sec - ((sbi)->unusable_blocks_per_sec >>\ (sbi)->log_blocks_per_seg)) #define GET_SEC_FROM_SEG(sbi, segno) \ (((segno) == -1) ? -1 : (segno) / (sbi)->segs_per_sec) #define GET_SEG_FROM_SEC(sbi, secno) \ ((secno) * (sbi)->segs_per_sec) #define GET_ZONE_FROM_SEC(sbi, secno) \ (((secno) == -1) ? -1 : (secno) / (sbi)->secs_per_zone) #define GET_ZONE_FROM_SEG(sbi, segno) \ GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) #define GET_SUM_BLOCK(sbi, segno) \ ((sbi)->sm_info->ssa_blkaddr + (segno)) #define GET_SUM_TYPE(footer) ((footer)->entry_type) #define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type)) #define SIT_ENTRY_OFFSET(sit_i, segno) \ ((segno) % (sit_i)->sents_per_block) #define SIT_BLOCK_OFFSET(segno) \ ((segno) / SIT_ENTRY_PER_BLOCK) #define START_SEGNO(segno) \ (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK) #define SIT_BLK_CNT(sbi) \ DIV_ROUND_UP(MAIN_SEGS(sbi), SIT_ENTRY_PER_BLOCK) #define f2fs_bitmap_size(nr) \ (BITS_TO_LONGS(nr) * sizeof(unsigned long)) #define SECTOR_FROM_BLOCK(blk_addr) \ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK) /* * indicate a block allocation direction: RIGHT and LEFT. * RIGHT means allocating new sections towards the end of volume. * LEFT means the opposite direction. */ enum { ALLOC_RIGHT = 0, ALLOC_LEFT }; /* * In the victim_sel_policy->alloc_mode, there are three block allocation modes. * LFS writes data sequentially with cleaning operations. * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations. * AT_SSR (Age Threshold based Slack Space Recycle) merges fragments into * fragmented segment which has similar aging degree. */ enum { LFS = 0, SSR, AT_SSR, }; /* * In the victim_sel_policy->gc_mode, there are three gc, aka cleaning, modes. * GC_CB is based on cost-benefit algorithm. * GC_GREEDY is based on greedy algorithm. * GC_AT is based on age-threshold algorithm. */ enum { GC_CB = 0, GC_GREEDY, GC_AT, ALLOC_NEXT, FLUSH_DEVICE, MAX_GC_POLICY, }; /* * BG_GC means the background cleaning job. * FG_GC means the on-demand cleaning job. */ enum { BG_GC = 0, FG_GC, }; /* for a function parameter to select a victim segment */ struct victim_sel_policy { int alloc_mode; /* LFS or SSR */ int gc_mode; /* GC_CB or GC_GREEDY */ unsigned long *dirty_bitmap; /* dirty segment/section bitmap */ unsigned int max_search; /* * maximum # of segments/sections * to search */ unsigned int offset; /* last scanned bitmap offset */ unsigned int ofs_unit; /* bitmap search unit */ unsigned int min_cost; /* minimum cost */ unsigned long long oldest_age; /* oldest age of segments having the same min cost */ unsigned int min_segno; /* segment # having min. cost */ unsigned long long age; /* mtime of GCed section*/ unsigned long long age_threshold;/* age threshold */ }; struct seg_entry { unsigned int type:6; /* segment type like CURSEG_XXX_TYPE */ unsigned int valid_blocks:10; /* # of valid blocks */ unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ unsigned int padding:6; /* padding */ unsigned char *cur_valid_map; /* validity bitmap of blocks */ #ifdef CONFIG_F2FS_CHECK_FS unsigned char *cur_valid_map_mir; /* mirror of current valid bitmap */ #endif /* * # of valid blocks and the validity bitmap stored in the last * checkpoint pack. This information is used by the SSR mode. */ unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */ unsigned char *discard_map; unsigned long long mtime; /* modification time of the segment */ }; struct sec_entry { unsigned int valid_blocks; /* # of valid blocks in a section */ }; #define MAX_SKIP_GC_COUNT 16 struct revoke_entry { struct list_head list; block_t old_addr; /* for revoking when fail to commit */ pgoff_t index; }; struct sit_info { block_t sit_base_addr; /* start block address of SIT area */ block_t sit_blocks; /* # of blocks used by SIT area */ block_t written_valid_blocks; /* # of valid blocks in main area */ char *bitmap; /* all bitmaps pointer */ char *sit_bitmap; /* SIT bitmap pointer */ #ifdef CONFIG_F2FS_CHECK_FS char *sit_bitmap_mir; /* SIT bitmap mirror */ /* bitmap of segments to be ignored by GC in case of errors */ unsigned long *invalid_segmap; #endif unsigned int bitmap_size; /* SIT bitmap size */ unsigned long *tmp_map; /* bitmap for temporal use */ unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */ unsigned int dirty_sentries; /* # of dirty sentries */ unsigned int sents_per_block; /* # of SIT entries per block */ struct rw_semaphore sentry_lock; /* to protect SIT cache */ struct seg_entry *sentries; /* SIT segment-level cache */ struct sec_entry *sec_entries; /* SIT section-level cache */ /* for cost-benefit algorithm in cleaning procedure */ unsigned long long elapsed_time; /* elapsed time after mount */ unsigned long long mounted_time; /* mount time */ unsigned long long min_mtime; /* min. modification time */ unsigned long long max_mtime; /* max. modification time */ unsigned long long dirty_min_mtime; /* rerange candidates in GC_AT */ unsigned long long dirty_max_mtime; /* rerange candidates in GC_AT */ unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */ }; struct free_segmap_info { unsigned int start_segno; /* start segment number logically */ unsigned int free_segments; /* # of free segments */ unsigned int free_sections; /* # of free sections */ spinlock_t segmap_lock; /* free segmap lock */ unsigned long *free_segmap; /* free segment bitmap */ unsigned long *free_secmap; /* free section bitmap */ }; /* Notice: The order of dirty type is same with CURSEG_XXX in f2fs.h */ enum dirty_type { DIRTY_HOT_DATA, /* dirty segments assigned as hot data logs */ DIRTY_WARM_DATA, /* dirty segments assigned as warm data logs */ DIRTY_COLD_DATA, /* dirty segments assigned as cold data logs */ DIRTY_HOT_NODE, /* dirty segments assigned as hot node logs */ DIRTY_WARM_NODE, /* dirty segments assigned as warm node logs */ DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */ DIRTY, /* to count # of dirty segments */ PRE, /* to count # of entirely obsolete segments */ NR_DIRTY_TYPE }; struct dirty_seglist_info { unsigned long *dirty_segmap[NR_DIRTY_TYPE]; unsigned long *dirty_secmap; struct mutex seglist_lock; /* lock for segment bitmaps */ int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */ unsigned long *victim_secmap; /* background GC victims */ unsigned long *pinned_secmap; /* pinned victims from foreground GC */ unsigned int pinned_secmap_cnt; /* count of victims which has pinned data */ bool enable_pin_section; /* enable pinning section */ }; /* for active log information */ struct curseg_info { struct mutex curseg_mutex; /* lock for consistency */ struct f2fs_summary_block *sum_blk; /* cached summary block */ struct rw_semaphore journal_rwsem; /* protect journal area */ struct f2fs_journal *journal; /* cached journal info */ unsigned char alloc_type; /* current allocation type */ unsigned short seg_type; /* segment type like CURSEG_XXX_TYPE */ unsigned int segno; /* current segment number */ unsigned short next_blkoff; /* next block offset to write */ unsigned int zone; /* current zone number */ unsigned int next_segno; /* preallocated segment */ int fragment_remained_chunk; /* remained block size in a chunk for block fragmentation mode */ bool inited; /* indicate inmem log is inited */ }; struct sit_entry_set { struct list_head set_list; /* link with all sit sets */ unsigned int start_segno; /* start segno of sits in set */ unsigned int entry_cnt; /* the # of sit entries in set */ }; /* * inline functions */ static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type) { return (struct curseg_info *)(SM_I(sbi)->curseg_array + type); } static inline struct seg_entry *get_seg_entry(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); return &sit_i->sentries[segno]; } static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)]; } static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, unsigned int segno, bool use_section) { /* * In order to get # of valid blocks in a section instantly from many * segments, f2fs manages two counting structures separately. */ if (use_section && __is_large_section(sbi)) return get_sec_entry(sbi, segno)->valid_blocks; else return get_seg_entry(sbi, segno)->valid_blocks; } static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi, unsigned int segno, bool use_section) { if (use_section && __is_large_section(sbi)) { unsigned int start_segno = START_SEGNO(segno); unsigned int blocks = 0; int i; for (i = 0; i < sbi->segs_per_sec; i++, start_segno++) { struct seg_entry *se = get_seg_entry(sbi, start_segno); blocks += se->ckpt_valid_blocks; } return blocks; } return get_seg_entry(sbi, segno)->ckpt_valid_blocks; } static inline void seg_info_from_raw_sit(struct seg_entry *se, struct f2fs_sit_entry *rs) { se->valid_blocks = GET_SIT_VBLOCKS(rs); se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs); memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); #ifdef CONFIG_F2FS_CHECK_FS memcpy(se->cur_valid_map_mir, rs->valid_map, SIT_VBLOCK_MAP_SIZE); #endif se->type = GET_SIT_TYPE(rs); se->mtime = le64_to_cpu(rs->mtime); } static inline void __seg_info_to_raw_sit(struct seg_entry *se, struct f2fs_sit_entry *rs) { unsigned short raw_vblocks = (se->type << SIT_VBLOCKS_SHIFT) | se->valid_blocks; rs->vblocks = cpu_to_le16(raw_vblocks); memcpy(rs->valid_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); rs->mtime = cpu_to_le64(se->mtime); } static inline void seg_info_to_sit_page(struct f2fs_sb_info *sbi, struct page *page, unsigned int start) { struct f2fs_sit_block *raw_sit; struct seg_entry *se; struct f2fs_sit_entry *rs; unsigned int end = min(start + SIT_ENTRY_PER_BLOCK, (unsigned long)MAIN_SEGS(sbi)); int i; raw_sit = (struct f2fs_sit_block *)page_address(page); memset(raw_sit, 0, PAGE_SIZE); for (i = 0; i < end - start; i++) { rs = &raw_sit->entries[i]; se = get_seg_entry(sbi, start + i); __seg_info_to_raw_sit(se, rs); } } static inline void seg_info_to_raw_sit(struct seg_entry *se, struct f2fs_sit_entry *rs) { __seg_info_to_raw_sit(se, rs); memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); se->ckpt_valid_blocks = se->valid_blocks; } static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, unsigned int max, unsigned int segno) { unsigned int ret; spin_lock(&free_i->segmap_lock); ret = find_next_bit(free_i->free_segmap, max, segno); spin_unlock(&free_i->segmap_lock); return ret; } static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno); spin_lock(&free_i->segmap_lock); clear_bit(segno, free_i->free_segmap); free_i->free_segments++; next = find_next_bit(free_i->free_segmap, start_segno + sbi->segs_per_sec, start_segno); if (next >= start_segno + usable_segs) { clear_bit(secno, free_i->free_secmap); free_i->free_sections++; } spin_unlock(&free_i->segmap_lock); } static inline void __set_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); set_bit(segno, free_i->free_segmap); free_i->free_segments--; if (!test_and_set_bit(secno, free_i->free_secmap)) free_i->free_sections--; } static inline void __set_test_and_free(struct f2fs_sb_info *sbi, unsigned int segno, bool inmem) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi, segno); spin_lock(&free_i->segmap_lock); if (test_and_clear_bit(segno, free_i->free_segmap)) { free_i->free_segments++; if (!inmem && IS_CURSEC(sbi, secno)) goto skip_free; next = find_next_bit(free_i->free_segmap, start_segno + sbi->segs_per_sec, start_segno); if (next >= start_segno + usable_segs) { if (test_and_clear_bit(secno, free_i->free_secmap)) free_i->free_sections++; } } skip_free: spin_unlock(&free_i->segmap_lock); } static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); spin_lock(&free_i->segmap_lock); if (!test_and_set_bit(segno, free_i->free_segmap)) { free_i->free_segments--; if (!test_and_set_bit(secno, free_i->free_secmap)) free_i->free_sections--; } spin_unlock(&free_i->segmap_lock); } static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, void *dst_addr) { struct sit_info *sit_i = SIT_I(sbi); #ifdef CONFIG_F2FS_CHECK_FS if (memcmp(sit_i->sit_bitmap, sit_i->sit_bitmap_mir, sit_i->bitmap_size)) f2fs_bug_on(sbi, 1); #endif memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size); } static inline block_t written_block_count(struct f2fs_sb_info *sbi) { return SIT_I(sbi)->written_valid_blocks; } static inline unsigned int free_segments(struct f2fs_sb_info *sbi) { return FREE_I(sbi)->free_segments; } static inline unsigned int reserved_segments(struct f2fs_sb_info *sbi) { return SM_I(sbi)->reserved_segments + SM_I(sbi)->additional_reserved_segments; } static inline unsigned int free_sections(struct f2fs_sb_info *sbi) { return FREE_I(sbi)->free_sections; } static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi) { return DIRTY_I(sbi)->nr_dirty[PRE]; } static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi) { return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] + DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_DATA] + DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_DATA] + DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_NODE] + DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_NODE] + DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_NODE]; } static inline int overprovision_segments(struct f2fs_sb_info *sbi) { return SM_I(sbi)->ovp_segments; } static inline int reserved_sections(struct f2fs_sb_info *sbi) { return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi)); } static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, unsigned int node_blocks, unsigned int dent_blocks) { unsigned int segno, left_blocks; int i; /* check current node segment */ for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) { segno = CURSEG_I(sbi, i)->segno; left_blocks = f2fs_usable_blks_in_seg(sbi, segno) - get_seg_entry(sbi, segno)->ckpt_valid_blocks; if (node_blocks > left_blocks) return false; } /* check current data segment */ segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; left_blocks = f2fs_usable_blks_in_seg(sbi, segno) - get_seg_entry(sbi, segno)->ckpt_valid_blocks; if (dent_blocks > left_blocks) return false; return true; } /* * calculate needed sections for dirty node/dentry * and call has_curseg_enough_space */ static inline void __get_secs_required(struct f2fs_sb_info *sbi, unsigned int *lower_p, unsigned int *upper_p, bool *curseg_p) { unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + get_pages(sbi, F2FS_DIRTY_DENTS) + get_pages(sbi, F2FS_DIRTY_IMETA); unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); unsigned int node_secs = total_node_blocks / CAP_BLKS_PER_SEC(sbi); unsigned int dent_secs = total_dent_blocks / CAP_BLKS_PER_SEC(sbi); unsigned int node_blocks = total_node_blocks % CAP_BLKS_PER_SEC(sbi); unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi); if (lower_p) *lower_p = node_secs + dent_secs; if (upper_p) *upper_p = node_secs + dent_secs + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0); if (curseg_p) *curseg_p = has_curseg_enough_space(sbi, node_blocks, dent_blocks); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { unsigned int free_secs, lower_secs, upper_secs; bool curseg_space; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; __get_secs_required(sbi, &lower_secs, &upper_secs, &curseg_space); free_secs = free_sections(sbi) + freed; lower_secs += needed + reserved_sections(sbi); upper_secs += needed + reserved_sections(sbi); if (free_secs > upper_secs) return false; else if (free_secs <= lower_secs) return true; return !curseg_space; } static inline bool has_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { return !has_not_enough_free_secs(sbi, freed, needed); } static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) { if (likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return true; if (likely(has_enough_free_secs(sbi, 0, 0))) return true; return false; } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) { return prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments; } static inline int utilization(struct f2fs_sb_info *sbi) { return div_u64((u64)valid_user_blocks(sbi) * 100, sbi->user_block_count); } /* * Sometimes f2fs may be better to drop out-of-place update policy. * And, users can control the policy through sysfs entries. * There are five policies with triggering conditions as follows. * F2FS_IPU_FORCE - all the time, * F2FS_IPU_SSR - if SSR mode is activated, * F2FS_IPU_UTIL - if FS utilization is over threashold, * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over * threashold, * F2FS_IPU_FSYNC - activated in fsync path only for high performance flash * storages. IPU will be triggered only if the # of dirty * pages over min_fsync_blocks. (=default option) * F2FS_IPU_ASYNC - do IPU given by asynchronous write requests. * F2FS_IPU_NOCACHE - disable IPU bio cache. * F2FS_IPU_HONOR_OPU_WRITE - use OPU write prior to IPU write if inode has * FI_OPU_WRITE flag. * F2FS_IPU_DISABLE - disable IPU. (=default option in LFS mode) */ #define DEF_MIN_IPU_UTIL 70 #define DEF_MIN_FSYNC_BLOCKS 8 #define DEF_MIN_HOT_BLOCKS 16 #define SMALL_VOLUME_SEGMENTS (16 * 512) /* 16GB */ #define F2FS_IPU_DISABLE 0 /* Modification on enum should be synchronized with ipu_mode_names array */ enum { F2FS_IPU_FORCE, F2FS_IPU_SSR, F2FS_IPU_UTIL, F2FS_IPU_SSR_UTIL, F2FS_IPU_FSYNC, F2FS_IPU_ASYNC, F2FS_IPU_NOCACHE, F2FS_IPU_HONOR_OPU_WRITE, F2FS_IPU_MAX, }; static inline bool IS_F2FS_IPU_DISABLE(struct f2fs_sb_info *sbi) { return SM_I(sbi)->ipu_policy == F2FS_IPU_DISABLE; } #define F2FS_IPU_POLICY(name) \ static inline bool IS_##name(struct f2fs_sb_info *sbi) \ { \ return SM_I(sbi)->ipu_policy & BIT(name); \ } F2FS_IPU_POLICY(F2FS_IPU_FORCE); F2FS_IPU_POLICY(F2FS_IPU_SSR); F2FS_IPU_POLICY(F2FS_IPU_UTIL); F2FS_IPU_POLICY(F2FS_IPU_SSR_UTIL); F2FS_IPU_POLICY(F2FS_IPU_FSYNC); F2FS_IPU_POLICY(F2FS_IPU_ASYNC); F2FS_IPU_POLICY(F2FS_IPU_NOCACHE); F2FS_IPU_POLICY(F2FS_IPU_HONOR_OPU_WRITE); static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); return curseg->segno; } static inline unsigned char curseg_alloc_type(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); return curseg->alloc_type; } static inline bool valid_main_segno(struct f2fs_sb_info *sbi, unsigned int segno) { return segno <= (MAIN_SEGS(sbi) - 1); } static inline void verify_fio_blkaddr(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; if (__is_valid_data_blkaddr(fio->old_blkaddr)) verify_blkaddr(sbi, fio->old_blkaddr, __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC); verify_blkaddr(sbi, fio->new_blkaddr, __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC_ENHANCE); } /* * Summary block is always treated as an invalid block */ static inline int check_block_count(struct f2fs_sb_info *sbi, int segno, struct f2fs_sit_entry *raw_sit) { bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false; int valid_blocks = 0; int cur_pos = 0, next_pos; unsigned int usable_blks_per_seg = f2fs_usable_blks_in_seg(sbi, segno); /* check bitmap with valid block count */ do { if (is_valid) { next_pos = find_next_zero_bit_le(&raw_sit->valid_map, usable_blks_per_seg, cur_pos); valid_blocks += next_pos - cur_pos; } else next_pos = find_next_bit_le(&raw_sit->valid_map, usable_blks_per_seg, cur_pos); cur_pos = next_pos; is_valid = !is_valid; } while (cur_pos < usable_blks_per_seg); if (unlikely(GET_SIT_VBLOCKS(raw_sit) != valid_blocks)) { f2fs_err(sbi, "Mismatch valid blocks %d vs. %d", GET_SIT_VBLOCKS(raw_sit), valid_blocks); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); return -EFSCORRUPTED; } if (usable_blks_per_seg < sbi->blocks_per_seg) f2fs_bug_on(sbi, find_next_bit_le(&raw_sit->valid_map, sbi->blocks_per_seg, usable_blks_per_seg) != sbi->blocks_per_seg); /* check segment usage, and check boundary of a given segment number */ if (unlikely(GET_SIT_VBLOCKS(raw_sit) > usable_blks_per_seg || !valid_main_segno(sbi, segno))) { f2fs_err(sbi, "Wrong valid blocks %d or segno %u", GET_SIT_VBLOCKS(raw_sit), segno); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INCONSISTENT_SIT); return -EFSCORRUPTED; } return 0; } static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, unsigned int start) { struct sit_info *sit_i = SIT_I(sbi); unsigned int offset = SIT_BLOCK_OFFSET(start); block_t blk_addr = sit_i->sit_base_addr + offset; f2fs_bug_on(sbi, !valid_main_segno(sbi, start)); #ifdef CONFIG_F2FS_CHECK_FS if (f2fs_test_bit(offset, sit_i->sit_bitmap) != f2fs_test_bit(offset, sit_i->sit_bitmap_mir)) f2fs_bug_on(sbi, 1); #endif /* calculate sit block address */ if (f2fs_test_bit(offset, sit_i->sit_bitmap)) blk_addr += sit_i->sit_blocks; return blk_addr; } static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi, pgoff_t block_addr) { struct sit_info *sit_i = SIT_I(sbi); block_addr -= sit_i->sit_base_addr; if (block_addr < sit_i->sit_blocks) block_addr += sit_i->sit_blocks; else block_addr -= sit_i->sit_blocks; return block_addr + sit_i->sit_base_addr; } static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) { unsigned int block_off = SIT_BLOCK_OFFSET(start); f2fs_change_bit(block_off, sit_i->sit_bitmap); #ifdef CONFIG_F2FS_CHECK_FS f2fs_change_bit(block_off, sit_i->sit_bitmap_mir); #endif } static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi, bool base_time) { struct sit_info *sit_i = SIT_I(sbi); time64_t diff, now = ktime_get_boottime_seconds(); if (now >= sit_i->mounted_time) return sit_i->elapsed_time + now - sit_i->mounted_time; /* system time is set to the past */ if (!base_time) { diff = sit_i->mounted_time - now; if (sit_i->elapsed_time >= diff) return sit_i->elapsed_time - diff; return 0; } return sit_i->elapsed_time; } static inline void set_summary(struct f2fs_summary *sum, nid_t nid, unsigned int ofs_in_node, unsigned char version) { sum->nid = cpu_to_le32(nid); sum->ofs_in_node = cpu_to_le16(ofs_in_node); sum->version = version; } static inline block_t start_sum_block(struct f2fs_sb_info *sbi) { return __start_cp_addr(sbi) + le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) { return __start_cp_addr(sbi) + le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count) - (base + 1) + type; } static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) { if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) return true; return false; } /* * It is very important to gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. * By default, 512 pages for directory data, * 512 pages (2MB) * 8 for nodes, and * 256 pages * 8 for meta are set. */ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) { if (sbi->sb->s_bdi->wb.dirty_exceeded) return 0; if (type == DATA) return sbi->blocks_per_seg; else if (type == NODE) return 8 * sbi->blocks_per_seg; else if (type == META) return 8 * BIO_MAX_VECS; else return 0; } /* * When writing pages, it'd better align nr_to_write for segment size. */ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, struct writeback_control *wbc) { long nr_to_write, desired; if (wbc->sync_mode != WB_SYNC_NONE) return 0; nr_to_write = wbc->nr_to_write; desired = BIO_MAX_VECS; if (type == NODE) desired <<= 1; wbc->nr_to_write = desired; return desired - nr_to_write; } static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; bool wakeup = false; int i; if (force) goto wake_up; mutex_lock(&dcc->cmd_lock); for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { if (i + 1 < dcc->discard_granularity) break; if (!list_empty(&dcc->pend_list[i])) { wakeup = true; break; } } mutex_unlock(&dcc->cmd_lock); if (!wakeup || !is_idle(sbi, DISCARD_TIME)) return; wake_up: dcc->discard_wake = true; wake_up_interruptible_all(&dcc->discard_wait_queue); }
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 // SPDX-License-Identifier: GPL-2.0-or-later /* * userdlm.c * * Code which implements the kernel side of a minimal userspace * interface to our DLM. * * Many of the functions here are pared down versions of dlmglue.c * functions. * * Copyright (C) 2003, 2004 Oracle. All rights reserved. */ #include <linux/signal.h> #include <linux/sched/signal.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/types.h> #include <linux/crc32.h> #include "../ocfs2_lockingver.h" #include "../stackglue.h" #include "userdlm.h" #define MLOG_MASK_PREFIX ML_DLMFS #include "../cluster/masklog.h" static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb) { return container_of(lksb, struct user_lock_res, l_lksb); } static inline int user_check_wait_flag(struct user_lock_res *lockres, int flag) { int ret; spin_lock(&lockres->l_lock); ret = lockres->l_flags & flag; spin_unlock(&lockres->l_lock); return ret; } static inline void user_wait_on_busy_lock(struct user_lock_res *lockres) { wait_event(lockres->l_event, !user_check_wait_flag(lockres, USER_LOCK_BUSY)); } static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres) { wait_event(lockres->l_event, !user_check_wait_flag(lockres, USER_LOCK_BLOCKED)); } /* I heart container_of... */ static inline struct ocfs2_cluster_connection * cluster_connection_from_user_lockres(struct user_lock_res *lockres) { struct dlmfs_inode_private *ip; ip = container_of(lockres, struct dlmfs_inode_private, ip_lockres); return ip->ip_conn; } static struct inode * user_dlm_inode_from_user_lockres(struct user_lock_res *lockres) { struct dlmfs_inode_private *ip; ip = container_of(lockres, struct dlmfs_inode_private, ip_lockres); return &ip->ip_vfs_inode; } static inline void user_recover_from_dlm_error(struct user_lock_res *lockres) { spin_lock(&lockres->l_lock); lockres->l_flags &= ~USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); } #define user_log_dlm_error(_func, _stat, _lockres) do { \ mlog(ML_ERROR, "Dlm error %d while calling %s on " \ "resource %.*s\n", _stat, _func, \ _lockres->l_namelen, _lockres->l_name); \ } while (0) /* WARNING: This function lives in a world where the only three lock * levels are EX, PR, and NL. It *will* have to be adjusted when more * lock types are added. */ static inline int user_highest_compat_lock_level(int level) { int new_level = DLM_LOCK_EX; if (level == DLM_LOCK_EX) new_level = DLM_LOCK_NL; else if (level == DLM_LOCK_PR) new_level = DLM_LOCK_PR; return new_level; } static void user_ast(struct ocfs2_dlm_lksb *lksb) { struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); int status; mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n", lockres->l_namelen, lockres->l_name, lockres->l_level, lockres->l_requested); spin_lock(&lockres->l_lock); status = ocfs2_dlm_lock_status(&lockres->l_lksb); if (status) { mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n", status, lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); return; } mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV, "Lockres %.*s, requested ivmode. flags 0x%x\n", lockres->l_namelen, lockres->l_name, lockres->l_flags); /* we're downconverting. */ if (lockres->l_requested < lockres->l_level) { if (lockres->l_requested <= user_highest_compat_lock_level(lockres->l_blocking)) { lockres->l_blocking = DLM_LOCK_NL; lockres->l_flags &= ~USER_LOCK_BLOCKED; } } lockres->l_level = lockres->l_requested; lockres->l_requested = DLM_LOCK_IV; lockres->l_flags |= USER_LOCK_ATTACHED; lockres->l_flags &= ~USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); wake_up(&lockres->l_event); } static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres) { struct inode *inode; inode = user_dlm_inode_from_user_lockres(lockres); if (!igrab(inode)) BUG(); } static void user_dlm_unblock_lock(struct work_struct *work); static void __user_dlm_queue_lockres(struct user_lock_res *lockres) { if (!(lockres->l_flags & USER_LOCK_QUEUED)) { user_dlm_grab_inode_ref(lockres); INIT_WORK(&lockres->l_work, user_dlm_unblock_lock); queue_work(user_dlm_worker, &lockres->l_work); lockres->l_flags |= USER_LOCK_QUEUED; } } static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres) { int queue = 0; if (!(lockres->l_flags & USER_LOCK_BLOCKED)) return; switch (lockres->l_blocking) { case DLM_LOCK_EX: if (!lockres->l_ex_holders && !lockres->l_ro_holders) queue = 1; break; case DLM_LOCK_PR: if (!lockres->l_ex_holders) queue = 1; break; default: BUG(); } if (queue) __user_dlm_queue_lockres(lockres); } static void user_bast(struct ocfs2_dlm_lksb *lksb, int level) { struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n", lockres->l_namelen, lockres->l_name, level, lockres->l_level); spin_lock(&lockres->l_lock); lockres->l_flags |= USER_LOCK_BLOCKED; if (level > lockres->l_blocking) lockres->l_blocking = level; __user_dlm_queue_lockres(lockres); spin_unlock(&lockres->l_lock); wake_up(&lockres->l_event); } static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status) { struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n", lockres->l_namelen, lockres->l_name, lockres->l_flags); if (status) mlog(ML_ERROR, "dlm returns status %d\n", status); spin_lock(&lockres->l_lock); /* The teardown flag gets set early during the unlock process, * so test the cancel flag to make sure that this ast isn't * for a concurrent cancel. */ if (lockres->l_flags & USER_LOCK_IN_TEARDOWN && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { lockres->l_level = DLM_LOCK_IV; } else if (status == DLM_CANCELGRANT) { /* We tried to cancel a convert request, but it was * already granted. Don't clear the busy flag - the * ast should've done this already. */ BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); lockres->l_flags &= ~USER_LOCK_IN_CANCEL; goto out_noclear; } else { BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); /* Cancel succeeded, we want to re-queue */ lockres->l_requested = DLM_LOCK_IV; /* cancel an * upconvert * request. */ lockres->l_flags &= ~USER_LOCK_IN_CANCEL; /* we want the unblock thread to look at it again * now. */ if (lockres->l_flags & USER_LOCK_BLOCKED) __user_dlm_queue_lockres(lockres); } lockres->l_flags &= ~USER_LOCK_BUSY; out_noclear: spin_unlock(&lockres->l_lock); wake_up(&lockres->l_event); } /* * This is the userdlmfs locking protocol version. * * See fs/ocfs2/dlmglue.c for more details on locking versions. */ static struct ocfs2_locking_protocol user_dlm_lproto = { .lp_max_version = { .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, }, .lp_lock_ast = user_ast, .lp_blocking_ast = user_bast, .lp_unlock_ast = user_unlock_ast, }; static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres) { struct inode *inode; inode = user_dlm_inode_from_user_lockres(lockres); iput(inode); } static void user_dlm_unblock_lock(struct work_struct *work) { int new_level, status; struct user_lock_res *lockres = container_of(work, struct user_lock_res, l_work); struct ocfs2_cluster_connection *conn = cluster_connection_from_user_lockres(lockres); mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name); spin_lock(&lockres->l_lock); mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED), "Lockres %.*s, flags 0x%x\n", lockres->l_namelen, lockres->l_name, lockres->l_flags); /* notice that we don't clear USER_LOCK_BLOCKED here. If it's * set, we want user_ast clear it. */ lockres->l_flags &= ~USER_LOCK_QUEUED; /* It's valid to get here and no longer be blocked - if we get * several basts in a row, we might be queued by the first * one, the unblock thread might run and clear the queued * flag, and finally we might get another bast which re-queues * us before our ast for the downconvert is called. */ if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n", lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); goto drop_ref; } if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n", lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); goto drop_ref; } if (lockres->l_flags & USER_LOCK_BUSY) { if (lockres->l_flags & USER_LOCK_IN_CANCEL) { mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n", lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); goto drop_ref; } lockres->l_flags |= USER_LOCK_IN_CANCEL; spin_unlock(&lockres->l_lock); status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_CANCEL); if (status) user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); goto drop_ref; } /* If there are still incompat holders, we can exit safely * without worrying about re-queueing this lock as that will * happen on the last call to user_cluster_unlock. */ if ((lockres->l_blocking == DLM_LOCK_EX) && (lockres->l_ex_holders || lockres->l_ro_holders)) { spin_unlock(&lockres->l_lock); mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n", lockres->l_namelen, lockres->l_name, lockres->l_ex_holders, lockres->l_ro_holders); goto drop_ref; } if ((lockres->l_blocking == DLM_LOCK_PR) && lockres->l_ex_holders) { spin_unlock(&lockres->l_lock); mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n", lockres->l_namelen, lockres->l_name, lockres->l_ex_holders); goto drop_ref; } /* yay, we can downconvert now. */ new_level = user_highest_compat_lock_level(lockres->l_blocking); lockres->l_requested = new_level; lockres->l_flags |= USER_LOCK_BUSY; mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n", lockres->l_namelen, lockres->l_name, lockres->l_level, new_level); spin_unlock(&lockres->l_lock); /* need lock downconvert request now... */ status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb, DLM_LKF_CONVERT|DLM_LKF_VALBLK, lockres->l_name, lockres->l_namelen); if (status) { user_log_dlm_error("ocfs2_dlm_lock", status, lockres); user_recover_from_dlm_error(lockres); } drop_ref: user_dlm_drop_inode_ref(lockres); } static inline void user_dlm_inc_holders(struct user_lock_res *lockres, int level) { switch(level) { case DLM_LOCK_EX: lockres->l_ex_holders++; break; case DLM_LOCK_PR: lockres->l_ro_holders++; break; default: BUG(); } } /* predict what lock level we'll be dropping down to on behalf * of another node, and return true if the currently wanted * level will be compatible with it. */ static inline int user_may_continue_on_blocked_lock(struct user_lock_res *lockres, int wanted) { BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); return wanted <= user_highest_compat_lock_level(lockres->l_blocking); } int user_dlm_cluster_lock(struct user_lock_res *lockres, int level, int lkm_flags) { int status, local_flags; struct ocfs2_cluster_connection *conn = cluster_connection_from_user_lockres(lockres); if (level != DLM_LOCK_EX && level != DLM_LOCK_PR) { mlog(ML_ERROR, "lockres %.*s: invalid request!\n", lockres->l_namelen, lockres->l_name); status = -EINVAL; goto bail; } mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n", lockres->l_namelen, lockres->l_name, level, lkm_flags); again: if (signal_pending(current)) { status = -ERESTARTSYS; goto bail; } spin_lock(&lockres->l_lock); if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { spin_unlock(&lockres->l_lock); status = -EAGAIN; goto bail; } /* We only compare against the currently granted level * here. If the lock is blocked waiting on a downconvert, * we'll get caught below. */ if ((lockres->l_flags & USER_LOCK_BUSY) && (level > lockres->l_level)) { /* is someone sitting in dlm_lock? If so, wait on * them. */ spin_unlock(&lockres->l_lock); user_wait_on_busy_lock(lockres); goto again; } if ((lockres->l_flags & USER_LOCK_BLOCKED) && (!user_may_continue_on_blocked_lock(lockres, level))) { /* is the lock is currently blocked on behalf of * another node */ spin_unlock(&lockres->l_lock); user_wait_on_blocked_lock(lockres); goto again; } if (level > lockres->l_level) { local_flags = lkm_flags | DLM_LKF_VALBLK; if (lockres->l_level != DLM_LOCK_IV) local_flags |= DLM_LKF_CONVERT; lockres->l_requested = level; lockres->l_flags |= USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); BUG_ON(level == DLM_LOCK_IV); BUG_ON(level == DLM_LOCK_NL); /* call dlm_lock to upgrade lock now */ status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb, local_flags, lockres->l_name, lockres->l_namelen); if (status) { if ((lkm_flags & DLM_LKF_NOQUEUE) && (status != -EAGAIN)) user_log_dlm_error("ocfs2_dlm_lock", status, lockres); user_recover_from_dlm_error(lockres); goto bail; } user_wait_on_busy_lock(lockres); goto again; } user_dlm_inc_holders(lockres, level); spin_unlock(&lockres->l_lock); status = 0; bail: return status; } static inline void user_dlm_dec_holders(struct user_lock_res *lockres, int level) { switch(level) { case DLM_LOCK_EX: BUG_ON(!lockres->l_ex_holders); lockres->l_ex_holders--; break; case DLM_LOCK_PR: BUG_ON(!lockres->l_ro_holders); lockres->l_ro_holders--; break; default: BUG(); } } void user_dlm_cluster_unlock(struct user_lock_res *lockres, int level) { if (level != DLM_LOCK_EX && level != DLM_LOCK_PR) { mlog(ML_ERROR, "lockres %.*s: invalid request!\n", lockres->l_namelen, lockres->l_name); return; } spin_lock(&lockres->l_lock); user_dlm_dec_holders(lockres, level); __user_dlm_cond_queue_lockres(lockres); spin_unlock(&lockres->l_lock); } void user_dlm_write_lvb(struct inode *inode, const char *val, unsigned int len) { struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; char *lvb; BUG_ON(len > DLM_LVB_LEN); spin_lock(&lockres->l_lock); BUG_ON(lockres->l_level < DLM_LOCK_EX); lvb = ocfs2_dlm_lvb(&lockres->l_lksb); memcpy(lvb, val, len); spin_unlock(&lockres->l_lock); } bool user_dlm_read_lvb(struct inode *inode, char *val) { struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; char *lvb; bool ret = true; spin_lock(&lockres->l_lock); BUG_ON(lockres->l_level < DLM_LOCK_PR); if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) { lvb = ocfs2_dlm_lvb(&lockres->l_lksb); memcpy(val, lvb, DLM_LVB_LEN); } else ret = false; spin_unlock(&lockres->l_lock); return ret; } void user_dlm_lock_res_init(struct user_lock_res *lockres, struct dentry *dentry) { memset(lockres, 0, sizeof(*lockres)); spin_lock_init(&lockres->l_lock); init_waitqueue_head(&lockres->l_event); lockres->l_level = DLM_LOCK_IV; lockres->l_requested = DLM_LOCK_IV; lockres->l_blocking = DLM_LOCK_IV; /* should have been checked before getting here. */ BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN); memcpy(lockres->l_name, dentry->d_name.name, dentry->d_name.len); lockres->l_namelen = dentry->d_name.len; } int user_dlm_destroy_lock(struct user_lock_res *lockres) { int status = -EBUSY; struct ocfs2_cluster_connection *conn = cluster_connection_from_user_lockres(lockres); mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name); spin_lock(&lockres->l_lock); if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { spin_unlock(&lockres->l_lock); goto bail; } lockres->l_flags |= USER_LOCK_IN_TEARDOWN; while (lockres->l_flags & USER_LOCK_BUSY) { spin_unlock(&lockres->l_lock); user_wait_on_busy_lock(lockres); spin_lock(&lockres->l_lock); } if (lockres->l_ro_holders || lockres->l_ex_holders) { lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN; spin_unlock(&lockres->l_lock); goto bail; } status = 0; if (!(lockres->l_flags & USER_LOCK_ATTACHED)) { /* * lock is never requested, leave USER_LOCK_IN_TEARDOWN set * to avoid new lock request coming in. */ spin_unlock(&lockres->l_lock); goto bail; } lockres->l_flags |= USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK); if (status) { spin_lock(&lockres->l_lock); lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN; lockres->l_flags &= ~USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); goto bail; } user_wait_on_busy_lock(lockres); status = 0; bail: return status; } static void user_dlm_recovery_handler_noop(int node_num, void *recovery_data) { /* We ignore recovery events */ return; } void user_dlm_set_locking_protocol(void) { ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version); } struct ocfs2_cluster_connection *user_dlm_register(const struct qstr *name) { int rc; struct ocfs2_cluster_connection *conn; rc = ocfs2_cluster_connect_agnostic(name->name, name->len, &user_dlm_lproto, user_dlm_recovery_handler_noop, NULL, &conn); if (rc) mlog_errno(rc); return rc ? ERR_PTR(rc) : conn; } void user_dlm_unregister(struct ocfs2_cluster_connection *conn) { ocfs2_cluster_disconnect(conn, 0); }
3 3 3 3 3 3 3 7 15 19 7 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include "messages.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "accessors.h" #include "dir-item.h" /* * insert a name into a directory, doing overflow properly if there is a hash * collision. data_size indicates how big the item inserted should be. On * success a struct btrfs_dir_item pointer is returned, otherwise it is * an ERR_PTR. * * The name is not copied into the dir item, you have to do that yourself. */ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *cpu_key, u32 data_size, const char *name, int name_len) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *ptr; struct extent_buffer *leaf; ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); if (ret == -EEXIST) { struct btrfs_dir_item *di; di = btrfs_match_dir_item_name(fs_info, path, name, name_len); if (di) return ERR_PTR(-EEXIST); btrfs_extend_item(trans, path, data_size); } else if (ret < 0) return ERR_PTR(ret); WARN_ON(ret > 0); leaf = path->nodes[0]; ptr = btrfs_item_ptr(leaf, path->slots[0], char); ASSERT(data_size <= btrfs_item_size(leaf, path->slots[0])); ptr += btrfs_item_size(leaf, path->slots[0]) - data_size; return (struct btrfs_dir_item *)ptr; } /* * xattrs work a lot like directories, this inserts an xattr item * into the tree */ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, const char *name, u16 name_len, const void *data, u16 data_len) { int ret = 0; struct btrfs_dir_item *dir_item; unsigned long name_ptr, data_ptr; struct btrfs_key key, location; struct btrfs_disk_key disk_key; struct extent_buffer *leaf; u32 data_size; if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info)) return -ENOSPC; key.objectid = objectid; key.type = BTRFS_XATTR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); data_size = sizeof(*dir_item) + name_len + data_len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, name, name_len); if (IS_ERR(dir_item)) return PTR_ERR(dir_item); memset(&location, 0, sizeof(location)); leaf = path->nodes[0]; btrfs_cpu_key_to_disk(&disk_key, &location); btrfs_set_dir_item_key(leaf, dir_item, &disk_key); btrfs_set_dir_flags(leaf, dir_item, BTRFS_FT_XATTR); btrfs_set_dir_name_len(leaf, dir_item, name_len); btrfs_set_dir_transid(leaf, dir_item, trans->transid); btrfs_set_dir_data_len(leaf, dir_item, data_len); name_ptr = (unsigned long)(dir_item + 1); data_ptr = (unsigned long)((char *)name_ptr + name_len); write_extent_buffer(leaf, name, name_ptr, name_len); write_extent_buffer(leaf, data, data_ptr, data_len); btrfs_mark_buffer_dirty(trans, path->nodes[0]); return ret; } /* * insert a directory item in the tree, doing all the magic for * both indexes. 'dir' indicates which objectid to insert it into, * 'location' is the key to stuff into the directory item, 'type' is the * type of the inode we're pointing to, and 'index' is the sequence number * to use for the second index (if one is created). * Will return 0 or -ENOMEM */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index) { int ret = 0; int ret2 = 0; struct btrfs_root *root = dir->root; struct btrfs_path *path; struct btrfs_dir_item *dir_item; struct extent_buffer *leaf; unsigned long name_ptr; struct btrfs_key key; struct btrfs_disk_key disk_key; u32 data_size; key.objectid = btrfs_ino(dir); key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name->name, name->len); path = btrfs_alloc_path(); if (!path) return -ENOMEM; btrfs_cpu_key_to_disk(&disk_key, location); data_size = sizeof(*dir_item) + name->len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, name->name, name->len); if (IS_ERR(dir_item)) { ret = PTR_ERR(dir_item); if (ret == -EEXIST) goto second_insert; goto out_free; } if (IS_ENCRYPTED(&dir->vfs_inode)) type |= BTRFS_FT_ENCRYPTED; leaf = path->nodes[0]; btrfs_set_dir_item_key(leaf, dir_item, &disk_key); btrfs_set_dir_flags(leaf, dir_item, type); btrfs_set_dir_data_len(leaf, dir_item, 0); btrfs_set_dir_name_len(leaf, dir_item, name->len); btrfs_set_dir_transid(leaf, dir_item, trans->transid); name_ptr = (unsigned long)(dir_item + 1); write_extent_buffer(leaf, name->name, name_ptr, name->len); btrfs_mark_buffer_dirty(trans, leaf); second_insert: /* FIXME, use some real flag for selecting the extra index */ if (root == root->fs_info->tree_root) { ret = 0; goto out_free; } btrfs_release_path(path); ret2 = btrfs_insert_delayed_dir_index(trans, name->name, name->len, dir, &disk_key, type, index); out_free: btrfs_free_path(path); if (ret) return ret; if (ret2) return ret2; return 0; } static struct btrfs_dir_item *btrfs_lookup_match_dir( struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, const char *name, int name_len, int mod) { const int ins_len = (mod < 0 ? -1 : 0); const int cow = (mod != 0); int ret; ret = btrfs_search_slot(trans, root, key, path, ins_len, cow); if (ret < 0) return ERR_PTR(ret); if (ret > 0) return ERR_PTR(-ENOENT); return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); } /* * Lookup for a directory item by name. * * @trans: The transaction handle to use. Can be NULL if @mod is 0. * @root: The root of the target tree. * @path: Path to use for the search. * @dir: The inode number (objectid) of the directory. * @name: The name associated to the directory entry we are looking for. * @name_len: The length of the name. * @mod: Used to indicate if the tree search is meant for a read only * lookup, for a modification lookup or for a deletion lookup, so * its value should be 0, 1 or -1, respectively. * * Returns: NULL if the dir item does not exists, an error pointer if an error * happened, or a pointer to a dir item if a dir item exists for the given name. */ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, const struct fscrypt_str *name, int mod) { struct btrfs_key key; struct btrfs_dir_item *di; key.objectid = dir; key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name->name, name->len); di = btrfs_lookup_match_dir(trans, root, path, &key, name->name, name->len, mod); if (IS_ERR(di) && PTR_ERR(di) == -ENOENT) return NULL; return di; } int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, const struct fscrypt_str *name) { int ret; struct btrfs_key key; struct btrfs_dir_item *di; int data_size; struct extent_buffer *leaf; int slot; struct btrfs_path *path; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = dir; key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name->name, name->len); di = btrfs_lookup_match_dir(NULL, root, path, &key, name->name, name->len, 0); if (IS_ERR(di)) { ret = PTR_ERR(di); /* Nothing found, we're safe */ if (ret == -ENOENT) { ret = 0; goto out; } if (ret < 0) goto out; } /* we found an item, look for our name in the item */ if (di) { /* our exact name was found */ ret = -EEXIST; goto out; } /* See if there is room in the item to insert this name. */ data_size = sizeof(*di) + name->len; leaf = path->nodes[0]; slot = path->slots[0]; if (data_size + btrfs_item_size(leaf, slot) + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) { ret = -EOVERFLOW; } else { /* plenty of insertion room */ ret = 0; } out: btrfs_free_path(path); return ret; } /* * Lookup for a directory index item by name and index number. * * @trans: The transaction handle to use. Can be NULL if @mod is 0. * @root: The root of the target tree. * @path: Path to use for the search. * @dir: The inode number (objectid) of the directory. * @index: The index number. * @name: The name associated to the directory entry we are looking for. * @name_len: The length of the name. * @mod: Used to indicate if the tree search is meant for a read only * lookup, for a modification lookup or for a deletion lookup, so * its value should be 0, 1 or -1, respectively. * * Returns: NULL if the dir index item does not exists, an error pointer if an * error happened, or a pointer to a dir item if the dir index item exists and * matches the criteria (name and index number). */ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, u64 index, const struct fscrypt_str *name, int mod) { struct btrfs_dir_item *di; struct btrfs_key key; key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = index; di = btrfs_lookup_match_dir(trans, root, path, &key, name->name, name->len, mod); if (di == ERR_PTR(-ENOENT)) return NULL; return di; } struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, const struct fscrypt_str *name) { struct btrfs_dir_item *di; struct btrfs_key key; int ret; key.objectid = dirid; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; btrfs_for_each_slot(root, &key, &key, path, ret) { if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) break; di = btrfs_match_dir_item_name(root->fs_info, path, name->name, name->len); if (di) return di; } /* Adjust return code if the key was not found in the next leaf. */ if (ret > 0) ret = 0; return ERR_PTR(ret); } struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, const char *name, u16 name_len, int mod) { struct btrfs_key key; struct btrfs_dir_item *di; key.objectid = dir; key.type = BTRFS_XATTR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); if (IS_ERR(di) && PTR_ERR(di) == -ENOENT) return NULL; return di; } /* * helper function to look at the directory item pointed to by 'path' * this walks through all the entries in a dir item and finds one * for a specific name. */ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, struct btrfs_path *path, const char *name, int name_len) { struct btrfs_dir_item *dir_item; unsigned long name_ptr; u32 total_len; u32 cur = 0; u32 this_len; struct extent_buffer *leaf; leaf = path->nodes[0]; dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); total_len = btrfs_item_size(leaf, path->slots[0]); while (cur < total_len) { this_len = sizeof(*dir_item) + btrfs_dir_name_len(leaf, dir_item) + btrfs_dir_data_len(leaf, dir_item); name_ptr = (unsigned long)(dir_item + 1); if (btrfs_dir_name_len(leaf, dir_item) == name_len && memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) return dir_item; cur += this_len; dir_item = (struct btrfs_dir_item *)((char *)dir_item + this_len); } return NULL; } /* * given a pointer into a directory item, delete it. This * handles items that have more than one entry in them. */ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_dir_item *di) { struct extent_buffer *leaf; u32 sub_item_len; u32 item_len; int ret = 0; leaf = path->nodes[0]; sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) + btrfs_dir_data_len(leaf, di); item_len = btrfs_item_size(leaf, path->slots[0]); if (sub_item_len == item_len) { ret = btrfs_del_item(trans, root, path); } else { /* MARKER */ unsigned long ptr = (unsigned long)di; unsigned long start; start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_len - (ptr + sub_item_len - start)); btrfs_truncate_item(trans, path, item_len - sub_item_len, 1); } return ret; }
332 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 // SPDX-License-Identifier: GPL-2.0 /* * This is a maximally equidistributed combined Tausworthe generator * based on code from GNU Scientific Library 1.5 (30 Jun 2004) * * lfsr113 version: * * x_n = (s1_n ^ s2_n ^ s3_n ^ s4_n) * * s1_{n+1} = (((s1_n & 4294967294) << 18) ^ (((s1_n << 6) ^ s1_n) >> 13)) * s2_{n+1} = (((s2_n & 4294967288) << 2) ^ (((s2_n << 2) ^ s2_n) >> 27)) * s3_{n+1} = (((s3_n & 4294967280) << 7) ^ (((s3_n << 13) ^ s3_n) >> 21)) * s4_{n+1} = (((s4_n & 4294967168) << 13) ^ (((s4_n << 3) ^ s4_n) >> 12)) * * The period of this generator is about 2^113 (see erratum paper). * * From: P. L'Ecuyer, "Maximally Equidistributed Combined Tausworthe * Generators", Mathematics of Computation, 65, 213 (1996), 203--213: * http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps * ftp://ftp.iro.umontreal.ca/pub/simulation/lecuyer/papers/tausme.ps * * There is an erratum in the paper "Tables of Maximally Equidistributed * Combined LFSR Generators", Mathematics of Computation, 68, 225 (1999), * 261--269: http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme2.ps * * ... the k_j most significant bits of z_j must be non-zero, * for each j. (Note: this restriction also applies to the * computer code given in [4], but was mistakenly not mentioned * in that paper.) * * This affects the seeding procedure by imposing the requirement * s1 > 1, s2 > 7, s3 > 15, s4 > 127. */ #include <linux/types.h> #include <linux/percpu.h> #include <linux/export.h> #include <linux/jiffies.h> #include <linux/random.h> #include <linux/sched.h> #include <linux/bitops.h> #include <linux/slab.h> #include <asm/unaligned.h> /** * prandom_u32_state - seeded pseudo-random number generator. * @state: pointer to state structure holding seeded state. * * This is used for pseudo-randomness with no outside seeding. * For more random results, use get_random_u32(). */ u32 prandom_u32_state(struct rnd_state *state) { #define TAUSWORTHE(s, a, b, c, d) ((s & c) << d) ^ (((s << a) ^ s) >> b) state->s1 = TAUSWORTHE(state->s1, 6U, 13U, 4294967294U, 18U); state->s2 = TAUSWORTHE(state->s2, 2U, 27U, 4294967288U, 2U); state->s3 = TAUSWORTHE(state->s3, 13U, 21U, 4294967280U, 7U); state->s4 = TAUSWORTHE(state->s4, 3U, 12U, 4294967168U, 13U); return (state->s1 ^ state->s2 ^ state->s3 ^ state->s4); } EXPORT_SYMBOL(prandom_u32_state); /** * prandom_bytes_state - get the requested number of pseudo-random bytes * * @state: pointer to state structure holding seeded state. * @buf: where to copy the pseudo-random bytes to * @bytes: the requested number of bytes * * This is used for pseudo-randomness with no outside seeding. * For more random results, use get_random_bytes(). */ void prandom_bytes_state(struct rnd_state *state, void *buf, size_t bytes) { u8 *ptr = buf; while (bytes >= sizeof(u32)) { put_unaligned(prandom_u32_state(state), (u32 *) ptr); ptr += sizeof(u32); bytes -= sizeof(u32); } if (bytes > 0) { u32 rem = prandom_u32_state(state); do { *ptr++ = (u8) rem; bytes--; rem >>= BITS_PER_BYTE; } while (bytes > 0); } } EXPORT_SYMBOL(prandom_bytes_state); static void prandom_warmup(struct rnd_state *state) { /* Calling RNG ten times to satisfy recurrence condition */ prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); } void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state) { int i; for_each_possible_cpu(i) { struct rnd_state *state = per_cpu_ptr(pcpu_state, i); u32 seeds[4]; get_random_bytes(&seeds, sizeof(seeds)); state->s1 = __seed(seeds[0], 2U); state->s2 = __seed(seeds[1], 8U); state->s3 = __seed(seeds[2], 16U); state->s4 = __seed(seeds[3], 128U); prandom_warmup(state); } } EXPORT_SYMBOL(prandom_seed_full_state); #ifdef CONFIG_RANDOM32_SELFTEST static struct prandom_test1 { u32 seed; u32 result; } test1[] = { { 1U, 3484351685U }, { 2U, 2623130059U }, { 3U, 3125133893U }, { 4U, 984847254U }, }; static struct prandom_test2 { u32 seed; u32 iteration; u32 result; } test2[] = { /* Test cases against taus113 from GSL library. */ { 931557656U, 959U, 2975593782U }, { 1339693295U, 876U, 3887776532U }, { 1545556285U, 961U, 1615538833U }, { 601730776U, 723U, 1776162651U }, { 1027516047U, 687U, 511983079U }, { 416526298U, 700U, 916156552U }, { 1395522032U, 652U, 2222063676U }, { 366221443U, 617U, 2992857763U }, { 1539836965U, 714U, 3783265725U }, { 556206671U, 994U, 799626459U }, { 684907218U, 799U, 367789491U }, { 2121230701U, 931U, 2115467001U }, { 1668516451U, 644U, 3620590685U }, { 768046066U, 883U, 2034077390U }, { 1989159136U, 833U, 1195767305U }, { 536585145U, 996U, 3577259204U }, { 1008129373U, 642U, 1478080776U }, { 1740775604U, 939U, 1264980372U }, { 1967883163U, 508U, 10734624U }, { 1923019697U, 730U, 3821419629U }, { 442079932U, 560U, 3440032343U }, { 1961302714U, 845U, 841962572U }, { 2030205964U, 962U, 1325144227U }, { 1160407529U, 507U, 240940858U }, { 635482502U, 779U, 4200489746U }, { 1252788931U, 699U, 867195434U }, { 1961817131U, 719U, 668237657U }, { 1071468216U, 983U, 917876630U }, { 1281848367U, 932U, 1003100039U }, { 582537119U, 780U, 1127273778U }, { 1973672777U, 853U, 1071368872U }, { 1896756996U, 762U, 1127851055U }, { 847917054U, 500U, 1717499075U }, { 1240520510U, 951U, 2849576657U }, { 1685071682U, 567U, 1961810396U }, { 1516232129U, 557U, 3173877U }, { 1208118903U, 612U, 1613145022U }, { 1817269927U, 693U, 4279122573U }, { 1510091701U, 717U, 638191229U }, { 365916850U, 807U, 600424314U }, { 399324359U, 702U, 1803598116U }, { 1318480274U, 779U, 2074237022U }, { 697758115U, 840U, 1483639402U }, { 1696507773U, 840U, 577415447U }, { 2081979121U, 981U, 3041486449U }, { 955646687U, 742U, 3846494357U }, { 1250683506U, 749U, 836419859U }, { 595003102U, 534U, 366794109U }, { 47485338U, 558U, 3521120834U }, { 619433479U, 610U, 3991783875U }, { 704096520U, 518U, 4139493852U }, { 1712224984U, 606U, 2393312003U }, { 1318233152U, 922U, 3880361134U }, { 855572992U, 761U, 1472974787U }, { 64721421U, 703U, 683860550U }, { 678931758U, 840U, 380616043U }, { 692711973U, 778U, 1382361947U }, { 677703619U, 530U, 2826914161U }, { 92393223U, 586U, 1522128471U }, { 1222592920U, 743U, 3466726667U }, { 358288986U, 695U, 1091956998U }, { 1935056945U, 958U, 514864477U }, { 735675993U, 990U, 1294239989U }, { 1560089402U, 897U, 2238551287U }, { 70616361U, 829U, 22483098U }, { 368234700U, 731U, 2913875084U }, { 20221190U, 879U, 1564152970U }, { 539444654U, 682U, 1835141259U }, { 1314987297U, 840U, 1801114136U }, { 2019295544U, 645U, 3286438930U }, { 469023838U, 716U, 1637918202U }, { 1843754496U, 653U, 2562092152U }, { 400672036U, 809U, 4264212785U }, { 404722249U, 965U, 2704116999U }, { 600702209U, 758U, 584979986U }, { 519953954U, 667U, 2574436237U }, { 1658071126U, 694U, 2214569490U }, { 420480037U, 749U, 3430010866U }, { 690103647U, 969U, 3700758083U }, { 1029424799U, 937U, 3787746841U }, { 2012608669U, 506U, 3362628973U }, { 1535432887U, 998U, 42610943U }, { 1330635533U, 857U, 3040806504U }, { 1223800550U, 539U, 3954229517U }, { 1322411537U, 680U, 3223250324U }, { 1877847898U, 945U, 2915147143U }, { 1646356099U, 874U, 965988280U }, { 805687536U, 744U, 4032277920U }, { 1948093210U, 633U, 1346597684U }, { 392609744U, 783U, 1636083295U }, { 690241304U, 770U, 1201031298U }, { 1360302965U, 696U, 1665394461U }, { 1220090946U, 780U, 1316922812U }, { 447092251U, 500U, 3438743375U }, { 1613868791U, 592U, 828546883U }, { 523430951U, 548U, 2552392304U }, { 726692899U, 810U, 1656872867U }, { 1364340021U, 836U, 3710513486U }, { 1986257729U, 931U, 935013962U }, { 407983964U, 921U, 728767059U }, }; static void prandom_state_selftest_seed(struct rnd_state *state, u32 seed) { #define LCG(x) ((x) * 69069U) /* super-duper LCG */ state->s1 = __seed(LCG(seed), 2U); state->s2 = __seed(LCG(state->s1), 8U); state->s3 = __seed(LCG(state->s2), 16U); state->s4 = __seed(LCG(state->s3), 128U); } static int __init prandom_state_selftest(void) { int i, j, errors = 0, runs = 0; bool error = false; for (i = 0; i < ARRAY_SIZE(test1); i++) { struct rnd_state state; prandom_state_selftest_seed(&state, test1[i].seed); prandom_warmup(&state); if (test1[i].result != prandom_u32_state(&state)) error = true; } if (error) pr_warn("prandom: seed boundary self test failed\n"); else pr_info("prandom: seed boundary self test passed\n"); for (i = 0; i < ARRAY_SIZE(test2); i++) { struct rnd_state state; prandom_state_selftest_seed(&state, test2[i].seed); prandom_warmup(&state); for (j = 0; j < test2[i].iteration - 1; j++) prandom_u32_state(&state); if (test2[i].result != prandom_u32_state(&state)) errors++; runs++; cond_resched(); } if (errors) pr_warn("prandom: %d/%d self tests failed\n", errors, runs); else pr_info("prandom: %d self tests passed\n", runs); return 0; } core_initcall(prandom_state_selftest); #endif
3 8 8 6 5 12 3 3 3 16 13 6 4 11 11 22 4 3 3 3 22 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 /* * DRBG: Deterministic Random Bits Generator * Based on NIST Recommended DRBG from NIST SP800-90A with the following * properties: * * CTR DRBG with DF with AES-128, AES-192, AES-256 cores * * Hash DRBG with DF with SHA-1, SHA-256, SHA-384, SHA-512 cores * * HMAC DRBG with DF with SHA-1, SHA-256, SHA-384, SHA-512 cores * * with and without prediction resistance * * Copyright Stephan Mueller <smueller@chronox.de>, 2014 * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * ALTERNATIVELY, this product may be distributed under the terms of * the GNU General Public License, in which case the provisions of the GPL are * required INSTEAD OF the above restrictions. (This clause is * necessary due to a potential bad interaction between the GPL and * the restrictions contained in a BSD-style copyright.) * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * * DRBG Usage * ========== * The SP 800-90A DRBG allows the user to specify a personalization string * for initialization as well as an additional information string for each * random number request. The following code fragments show how a caller * uses the kernel crypto API to use the full functionality of the DRBG. * * Usage without any additional data * --------------------------------- * struct crypto_rng *drng; * int err; * char data[DATALEN]; * * drng = crypto_alloc_rng(drng_name, 0, 0); * err = crypto_rng_get_bytes(drng, &data, DATALEN); * crypto_free_rng(drng); * * * Usage with personalization string during initialization * ------------------------------------------------------- * struct crypto_rng *drng; * int err; * char data[DATALEN]; * struct drbg_string pers; * char personalization[11] = "some-string"; * * drbg_string_fill(&pers, personalization, strlen(personalization)); * drng = crypto_alloc_rng(drng_name, 0, 0); * // The reset completely re-initializes the DRBG with the provided * // personalization string * err = crypto_rng_reset(drng, &personalization, strlen(personalization)); * err = crypto_rng_get_bytes(drng, &data, DATALEN); * crypto_free_rng(drng); * * * Usage with additional information string during random number request * --------------------------------------------------------------------- * struct crypto_rng *drng; * int err; * char data[DATALEN]; * char addtl_string[11] = "some-string"; * string drbg_string addtl; * * drbg_string_fill(&addtl, addtl_string, strlen(addtl_string)); * drng = crypto_alloc_rng(drng_name, 0, 0); * // The following call is a wrapper to crypto_rng_get_bytes() and returns * // the same error codes. * err = crypto_drbg_get_bytes_addtl(drng, &data, DATALEN, &addtl); * crypto_free_rng(drng); * * * Usage with personalization and additional information strings * ------------------------------------------------------------- * Just mix both scenarios above. */ #include <crypto/drbg.h> #include <crypto/internal/cipher.h> #include <linux/kernel.h> #include <linux/jiffies.h> /*************************************************************** * Backend cipher definitions available to DRBG ***************************************************************/ /* * The order of the DRBG definitions here matter: every DRBG is registered * as stdrng. Each DRBG receives an increasing cra_priority values the later * they are defined in this array (see drbg_fill_array). * * HMAC DRBGs are favored over Hash DRBGs over CTR DRBGs, and * the SHA256 / AES 256 over other ciphers. Thus, the favored * DRBGs are the latest entries in this array. */ static const struct drbg_core drbg_cores[] = { #ifdef CONFIG_CRYPTO_DRBG_CTR { .flags = DRBG_CTR | DRBG_STRENGTH128, .statelen = 32, /* 256 bits as defined in 10.2.1 */ .blocklen_bytes = 16, .cra_name = "ctr_aes128", .backend_cra_name = "aes", }, { .flags = DRBG_CTR | DRBG_STRENGTH192, .statelen = 40, /* 320 bits as defined in 10.2.1 */ .blocklen_bytes = 16, .cra_name = "ctr_aes192", .backend_cra_name = "aes", }, { .flags = DRBG_CTR | DRBG_STRENGTH256, .statelen = 48, /* 384 bits as defined in 10.2.1 */ .blocklen_bytes = 16, .cra_name = "ctr_aes256", .backend_cra_name = "aes", }, #endif /* CONFIG_CRYPTO_DRBG_CTR */ #ifdef CONFIG_CRYPTO_DRBG_HASH { .flags = DRBG_HASH | DRBG_STRENGTH128, .statelen = 55, /* 440 bits */ .blocklen_bytes = 20, .cra_name = "sha1", .backend_cra_name = "sha1", }, { .flags = DRBG_HASH | DRBG_STRENGTH256, .statelen = 111, /* 888 bits */ .blocklen_bytes = 48, .cra_name = "sha384", .backend_cra_name = "sha384", }, { .flags = DRBG_HASH | DRBG_STRENGTH256, .statelen = 111, /* 888 bits */ .blocklen_bytes = 64, .cra_name = "sha512", .backend_cra_name = "sha512", }, { .flags = DRBG_HASH | DRBG_STRENGTH256, .statelen = 55, /* 440 bits */ .blocklen_bytes = 32, .cra_name = "sha256", .backend_cra_name = "sha256", }, #endif /* CONFIG_CRYPTO_DRBG_HASH */ #ifdef CONFIG_CRYPTO_DRBG_HMAC { .flags = DRBG_HMAC | DRBG_STRENGTH128, .statelen = 20, /* block length of cipher */ .blocklen_bytes = 20, .cra_name = "hmac_sha1", .backend_cra_name = "hmac(sha1)", }, { .flags = DRBG_HMAC | DRBG_STRENGTH256, .statelen = 48, /* block length of cipher */ .blocklen_bytes = 48, .cra_name = "hmac_sha384", .backend_cra_name = "hmac(sha384)", }, { .flags = DRBG_HMAC | DRBG_STRENGTH256, .statelen = 32, /* block length of cipher */ .blocklen_bytes = 32, .cra_name = "hmac_sha256", .backend_cra_name = "hmac(sha256)", }, { .flags = DRBG_HMAC | DRBG_STRENGTH256, .statelen = 64, /* block length of cipher */ .blocklen_bytes = 64, .cra_name = "hmac_sha512", .backend_cra_name = "hmac(sha512)", }, #endif /* CONFIG_CRYPTO_DRBG_HMAC */ }; static int drbg_uninstantiate(struct drbg_state *drbg); /****************************************************************** * Generic helper functions ******************************************************************/ /* * Return strength of DRBG according to SP800-90A section 8.4 * * @flags DRBG flags reference * * Return: normalized strength in *bytes* value or 32 as default * to counter programming errors */ static inline unsigned short drbg_sec_strength(drbg_flag_t flags) { switch (flags & DRBG_STRENGTH_MASK) { case DRBG_STRENGTH128: return 16; case DRBG_STRENGTH192: return 24; case DRBG_STRENGTH256: return 32; default: return 32; } } /* * FIPS 140-2 continuous self test for the noise source * The test is performed on the noise source input data. Thus, the function * implicitly knows the size of the buffer to be equal to the security * strength. * * Note, this function disregards the nonce trailing the entropy data during * initial seeding. * * drbg->drbg_mutex must have been taken. * * @drbg DRBG handle * @entropy buffer of seed data to be checked * * return: * 0 on success * -EAGAIN on when the CTRNG is not yet primed * < 0 on error */ static int drbg_fips_continuous_test(struct drbg_state *drbg, const unsigned char *entropy) { unsigned short entropylen = drbg_sec_strength(drbg->core->flags); int ret = 0; if (!IS_ENABLED(CONFIG_CRYPTO_FIPS)) return 0; /* skip test if we test the overall system */ if (list_empty(&drbg->test_data.list)) return 0; /* only perform test in FIPS mode */ if (!fips_enabled) return 0; if (!drbg->fips_primed) { /* Priming of FIPS test */ memcpy(drbg->prev, entropy, entropylen); drbg->fips_primed = true; /* priming: another round is needed */ return -EAGAIN; } ret = memcmp(drbg->prev, entropy, entropylen); if (!ret) panic("DRBG continuous self test failed\n"); memcpy(drbg->prev, entropy, entropylen); /* the test shall pass when the two values are not equal */ return 0; } /* * Convert an integer into a byte representation of this integer. * The byte representation is big-endian * * @val value to be converted * @buf buffer holding the converted integer -- caller must ensure that * buffer size is at least 32 bit */ #if (defined(CONFIG_CRYPTO_DRBG_HASH) || defined(CONFIG_CRYPTO_DRBG_CTR)) static inline void drbg_cpu_to_be32(__u32 val, unsigned char *buf) { struct s { __be32 conv; }; struct s *conversion = (struct s *) buf; conversion->conv = cpu_to_be32(val); } #endif /* defined(CONFIG_CRYPTO_DRBG_HASH) || defined(CONFIG_CRYPTO_DRBG_CTR) */ /****************************************************************** * CTR DRBG callback functions ******************************************************************/ #ifdef CONFIG_CRYPTO_DRBG_CTR #define CRYPTO_DRBG_CTR_STRING "CTR " MODULE_ALIAS_CRYPTO("drbg_pr_ctr_aes256"); MODULE_ALIAS_CRYPTO("drbg_nopr_ctr_aes256"); MODULE_ALIAS_CRYPTO("drbg_pr_ctr_aes192"); MODULE_ALIAS_CRYPTO("drbg_nopr_ctr_aes192"); MODULE_ALIAS_CRYPTO("drbg_pr_ctr_aes128"); MODULE_ALIAS_CRYPTO("drbg_nopr_ctr_aes128"); static void drbg_kcapi_symsetkey(struct drbg_state *drbg, const unsigned char *key); static int drbg_kcapi_sym(struct drbg_state *drbg, unsigned char *outval, const struct drbg_string *in); static int drbg_init_sym_kernel(struct drbg_state *drbg); static int drbg_fini_sym_kernel(struct drbg_state *drbg); static int drbg_kcapi_sym_ctr(struct drbg_state *drbg, u8 *inbuf, u32 inbuflen, u8 *outbuf, u32 outlen); #define DRBG_OUTSCRATCHLEN 256 /* BCC function for CTR DRBG as defined in 10.4.3 */ static int drbg_ctr_bcc(struct drbg_state *drbg, unsigned char *out, const unsigned char *key, struct list_head *in) { int ret = 0; struct drbg_string *curr = NULL; struct drbg_string data; short cnt = 0; drbg_string_fill(&data, out, drbg_blocklen(drbg)); /* 10.4.3 step 2 / 4 */ drbg_kcapi_symsetkey(drbg, key); list_for_each_entry(curr, in, list) { const unsigned char *pos = curr->buf; size_t len = curr->len; /* 10.4.3 step 4.1 */ while (len) { /* 10.4.3 step 4.2 */ if (drbg_blocklen(drbg) == cnt) { cnt = 0; ret = drbg_kcapi_sym(drbg, out, &data); if (ret) return ret; } out[cnt] ^= *pos; pos++; cnt++; len--; } } /* 10.4.3 step 4.2 for last block */ if (cnt) ret = drbg_kcapi_sym(drbg, out, &data); return ret; } /* * scratchpad usage: drbg_ctr_update is interlinked with drbg_ctr_df * (and drbg_ctr_bcc, but this function does not need any temporary buffers), * the scratchpad is used as follows: * drbg_ctr_update: * temp * start: drbg->scratchpad * length: drbg_statelen(drbg) + drbg_blocklen(drbg) * note: the cipher writing into this variable works * blocklen-wise. Now, when the statelen is not a multiple * of blocklen, the generateion loop below "spills over" * by at most blocklen. Thus, we need to give sufficient * memory. * df_data * start: drbg->scratchpad + * drbg_statelen(drbg) + drbg_blocklen(drbg) * length: drbg_statelen(drbg) * * drbg_ctr_df: * pad * start: df_data + drbg_statelen(drbg) * length: drbg_blocklen(drbg) * iv * start: pad + drbg_blocklen(drbg) * length: drbg_blocklen(drbg) * temp * start: iv + drbg_blocklen(drbg) * length: drbg_satelen(drbg) + drbg_blocklen(drbg) * note: temp is the buffer that the BCC function operates * on. BCC operates blockwise. drbg_statelen(drbg) * is sufficient when the DRBG state length is a multiple * of the block size. For AES192 (and maybe other ciphers) * this is not correct and the length for temp is * insufficient (yes, that also means for such ciphers, * the final output of all BCC rounds are truncated). * Therefore, add drbg_blocklen(drbg) to cover all * possibilities. */ /* Derivation Function for CTR DRBG as defined in 10.4.2 */ static int drbg_ctr_df(struct drbg_state *drbg, unsigned char *df_data, size_t bytes_to_return, struct list_head *seedlist) { int ret = -EFAULT; unsigned char L_N[8]; /* S3 is input */ struct drbg_string S1, S2, S4, cipherin; LIST_HEAD(bcc_list); unsigned char *pad = df_data + drbg_statelen(drbg); unsigned char *iv = pad + drbg_blocklen(drbg); unsigned char *temp = iv + drbg_blocklen(drbg); size_t padlen = 0; unsigned int templen = 0; /* 10.4.2 step 7 */ unsigned int i = 0; /* 10.4.2 step 8 */ const unsigned char *K = (unsigned char *) "\x00\x01\x02\x03\x04\x05\x06\x07" "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" "\x10\x11\x12\x13\x14\x15\x16\x17" "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"; unsigned char *X; size_t generated_len = 0; size_t inputlen = 0; struct drbg_string *seed = NULL; memset(pad, 0, drbg_blocklen(drbg)); memset(iv, 0, drbg_blocklen(drbg)); /* 10.4.2 step 1 is implicit as we work byte-wise */ /* 10.4.2 step 2 */ if ((512/8) < bytes_to_return) return -EINVAL; /* 10.4.2 step 2 -- calculate the entire length of all input data */ list_for_each_entry(seed, seedlist, list) inputlen += seed->len; drbg_cpu_to_be32(inputlen, &L_N[0]); /* 10.4.2 step 3 */ drbg_cpu_to_be32(bytes_to_return, &L_N[4]); /* 10.4.2 step 5: length is L_N, input_string, one byte, padding */ padlen = (inputlen + sizeof(L_N) + 1) % (drbg_blocklen(drbg)); /* wrap the padlen appropriately */ if (padlen) padlen = drbg_blocklen(drbg) - padlen; /* * pad / padlen contains the 0x80 byte and the following zero bytes. * As the calculated padlen value only covers the number of zero * bytes, this value has to be incremented by one for the 0x80 byte. */ padlen++; pad[0] = 0x80; /* 10.4.2 step 4 -- first fill the linked list and then order it */ drbg_string_fill(&S1, iv, drbg_blocklen(drbg)); list_add_tail(&S1.list, &bcc_list); drbg_string_fill(&S2, L_N, sizeof(L_N)); list_add_tail(&S2.list, &bcc_list); list_splice_tail(seedlist, &bcc_list); drbg_string_fill(&S4, pad, padlen); list_add_tail(&S4.list, &bcc_list); /* 10.4.2 step 9 */ while (templen < (drbg_keylen(drbg) + (drbg_blocklen(drbg)))) { /* * 10.4.2 step 9.1 - the padding is implicit as the buffer * holds zeros after allocation -- even the increment of i * is irrelevant as the increment remains within length of i */ drbg_cpu_to_be32(i, iv); /* 10.4.2 step 9.2 -- BCC and concatenation with temp */ ret = drbg_ctr_bcc(drbg, temp + templen, K, &bcc_list); if (ret) goto out; /* 10.4.2 step 9.3 */ i++; templen += drbg_blocklen(drbg); } /* 10.4.2 step 11 */ X = temp + (drbg_keylen(drbg)); drbg_string_fill(&cipherin, X, drbg_blocklen(drbg)); /* 10.4.2 step 12: overwriting of outval is implemented in next step */ /* 10.4.2 step 13 */ drbg_kcapi_symsetkey(drbg, temp); while (generated_len < bytes_to_return) { short blocklen = 0; /* * 10.4.2 step 13.1: the truncation of the key length is * implicit as the key is only drbg_blocklen in size based on * the implementation of the cipher function callback */ ret = drbg_kcapi_sym(drbg, X, &cipherin); if (ret) goto out; blocklen = (drbg_blocklen(drbg) < (bytes_to_return - generated_len)) ? drbg_blocklen(drbg) : (bytes_to_return - generated_len); /* 10.4.2 step 13.2 and 14 */ memcpy(df_data + generated_len, X, blocklen); generated_len += blocklen; } ret = 0; out: memset(iv, 0, drbg_blocklen(drbg)); memset(temp, 0, drbg_statelen(drbg) + drbg_blocklen(drbg)); memset(pad, 0, drbg_blocklen(drbg)); return ret; } /* * update function of CTR DRBG as defined in 10.2.1.2 * * The reseed variable has an enhanced meaning compared to the update * functions of the other DRBGs as follows: * 0 => initial seed from initialization * 1 => reseed via drbg_seed * 2 => first invocation from drbg_ctr_update when addtl is present. In * this case, the df_data scratchpad is not deleted so that it is * available for another calls to prevent calling the DF function * again. * 3 => second invocation from drbg_ctr_update. When the update function * was called with addtl, the df_data memory already contains the * DFed addtl information and we do not need to call DF again. */ static int drbg_ctr_update(struct drbg_state *drbg, struct list_head *seed, int reseed) { int ret = -EFAULT; /* 10.2.1.2 step 1 */ unsigned char *temp = drbg->scratchpad; unsigned char *df_data = drbg->scratchpad + drbg_statelen(drbg) + drbg_blocklen(drbg); if (3 > reseed) memset(df_data, 0, drbg_statelen(drbg)); if (!reseed) { /* * The DRBG uses the CTR mode of the underlying AES cipher. The * CTR mode increments the counter value after the AES operation * but SP800-90A requires that the counter is incremented before * the AES operation. Hence, we increment it at the time we set * it by one. */ crypto_inc(drbg->V, drbg_blocklen(drbg)); ret = crypto_skcipher_setkey(drbg->ctr_handle, drbg->C, drbg_keylen(drbg)); if (ret) goto out; } /* 10.2.1.3.2 step 2 and 10.2.1.4.2 step 2 */ if (seed) { ret = drbg_ctr_df(drbg, df_data, drbg_statelen(drbg), seed); if (ret) goto out; } ret = drbg_kcapi_sym_ctr(drbg, df_data, drbg_statelen(drbg), temp, drbg_statelen(drbg)); if (ret) return ret; /* 10.2.1.2 step 5 */ ret = crypto_skcipher_setkey(drbg->ctr_handle, temp, drbg_keylen(drbg)); if (ret) goto out; /* 10.2.1.2 step 6 */ memcpy(drbg->V, temp + drbg_keylen(drbg), drbg_blocklen(drbg)); /* See above: increment counter by one to compensate timing of CTR op */ crypto_inc(drbg->V, drbg_blocklen(drbg)); ret = 0; out: memset(temp, 0, drbg_statelen(drbg) + drbg_blocklen(drbg)); if (2 != reseed) memset(df_data, 0, drbg_statelen(drbg)); return ret; } /* * scratchpad use: drbg_ctr_update is called independently from * drbg_ctr_extract_bytes. Therefore, the scratchpad is reused */ /* Generate function of CTR DRBG as defined in 10.2.1.5.2 */ static int drbg_ctr_generate(struct drbg_state *drbg, unsigned char *buf, unsigned int buflen, struct list_head *addtl) { int ret; int len = min_t(int, buflen, INT_MAX); /* 10.2.1.5.2 step 2 */ if (addtl && !list_empty(addtl)) { ret = drbg_ctr_update(drbg, addtl, 2); if (ret) return 0; } /* 10.2.1.5.2 step 4.1 */ ret = drbg_kcapi_sym_ctr(drbg, NULL, 0, buf, len); if (ret) return ret; /* 10.2.1.5.2 step 6 */ ret = drbg_ctr_update(drbg, NULL, 3); if (ret) len = ret; return len; } static const struct drbg_state_ops drbg_ctr_ops = { .update = drbg_ctr_update, .generate = drbg_ctr_generate, .crypto_init = drbg_init_sym_kernel, .crypto_fini = drbg_fini_sym_kernel, }; #endif /* CONFIG_CRYPTO_DRBG_CTR */ /****************************************************************** * HMAC DRBG callback functions ******************************************************************/ #if defined(CONFIG_CRYPTO_DRBG_HASH) || defined(CONFIG_CRYPTO_DRBG_HMAC) static int drbg_kcapi_hash(struct drbg_state *drbg, unsigned char *outval, const struct list_head *in); static void drbg_kcapi_hmacsetkey(struct drbg_state *drbg, const unsigned char *key); static int drbg_init_hash_kernel(struct drbg_state *drbg); static int drbg_fini_hash_kernel(struct drbg_state *drbg); #endif /* (CONFIG_CRYPTO_DRBG_HASH || CONFIG_CRYPTO_DRBG_HMAC) */ #ifdef CONFIG_CRYPTO_DRBG_HMAC #define CRYPTO_DRBG_HMAC_STRING "HMAC " MODULE_ALIAS_CRYPTO("drbg_pr_hmac_sha512"); MODULE_ALIAS_CRYPTO("drbg_nopr_hmac_sha512"); MODULE_ALIAS_CRYPTO("drbg_pr_hmac_sha384"); MODULE_ALIAS_CRYPTO("drbg_nopr_hmac_sha384"); MODULE_ALIAS_CRYPTO("drbg_pr_hmac_sha256"); MODULE_ALIAS_CRYPTO("drbg_nopr_hmac_sha256"); MODULE_ALIAS_CRYPTO("drbg_pr_hmac_sha1"); MODULE_ALIAS_CRYPTO("drbg_nopr_hmac_sha1"); /* update function of HMAC DRBG as defined in 10.1.2.2 */ static int drbg_hmac_update(struct drbg_state *drbg, struct list_head *seed, int reseed) { int ret = -EFAULT; int i = 0; struct drbg_string seed1, seed2, vdata; LIST_HEAD(seedlist); LIST_HEAD(vdatalist); if (!reseed) { /* 10.1.2.3 step 2 -- memset(0) of C is implicit with kzalloc */ memset(drbg->V, 1, drbg_statelen(drbg)); drbg_kcapi_hmacsetkey(drbg, drbg->C); } drbg_string_fill(&seed1, drbg->V, drbg_statelen(drbg)); list_add_tail(&seed1.list, &seedlist); /* buffer of seed2 will be filled in for loop below with one byte */ drbg_string_fill(&seed2, NULL, 1); list_add_tail(&seed2.list, &seedlist); /* input data of seed is allowed to be NULL at this point */ if (seed) list_splice_tail(seed, &seedlist); drbg_string_fill(&vdata, drbg->V, drbg_statelen(drbg)); list_add_tail(&vdata.list, &vdatalist); for (i = 2; 0 < i; i--) { /* first round uses 0x0, second 0x1 */ unsigned char prefix = DRBG_PREFIX0; if (1 == i) prefix = DRBG_PREFIX1; /* 10.1.2.2 step 1 and 4 -- concatenation and HMAC for key */ seed2.buf = &prefix; ret = drbg_kcapi_hash(drbg, drbg->C, &seedlist); if (ret) return ret; drbg_kcapi_hmacsetkey(drbg, drbg->C); /* 10.1.2.2 step 2 and 5 -- HMAC for V */ ret = drbg_kcapi_hash(drbg, drbg->V, &vdatalist); if (ret) return ret; /* 10.1.2.2 step 3 */ if (!seed) return ret; } return 0; } /* generate function of HMAC DRBG as defined in 10.1.2.5 */ static int drbg_hmac_generate(struct drbg_state *drbg, unsigned char *buf, unsigned int buflen, struct list_head *addtl) { int len = 0; int ret = 0; struct drbg_string data; LIST_HEAD(datalist); /* 10.1.2.5 step 2 */ if (addtl && !list_empty(addtl)) { ret = drbg_hmac_update(drbg, addtl, 1); if (ret) return ret; } drbg_string_fill(&data, drbg->V, drbg_statelen(drbg)); list_add_tail(&data.list, &datalist); while (len < buflen) { unsigned int outlen = 0; /* 10.1.2.5 step 4.1 */ ret = drbg_kcapi_hash(drbg, drbg->V, &datalist); if (ret) return ret; outlen = (drbg_blocklen(drbg) < (buflen - len)) ? drbg_blocklen(drbg) : (buflen - len); /* 10.1.2.5 step 4.2 */ memcpy(buf + len, drbg->V, outlen); len += outlen; } /* 10.1.2.5 step 6 */ if (addtl && !list_empty(addtl)) ret = drbg_hmac_update(drbg, addtl, 1); else ret = drbg_hmac_update(drbg, NULL, 1); if (ret) return ret; return len; } static const struct drbg_state_ops drbg_hmac_ops = { .update = drbg_hmac_update, .generate = drbg_hmac_generate, .crypto_init = drbg_init_hash_kernel, .crypto_fini = drbg_fini_hash_kernel, }; #endif /* CONFIG_CRYPTO_DRBG_HMAC */ /****************************************************************** * Hash DRBG callback functions ******************************************************************/ #ifdef CONFIG_CRYPTO_DRBG_HASH #define CRYPTO_DRBG_HASH_STRING "HASH " MODULE_ALIAS_CRYPTO("drbg_pr_sha512"); MODULE_ALIAS_CRYPTO("drbg_nopr_sha512"); MODULE_ALIAS_CRYPTO("drbg_pr_sha384"); MODULE_ALIAS_CRYPTO("drbg_nopr_sha384"); MODULE_ALIAS_CRYPTO("drbg_pr_sha256"); MODULE_ALIAS_CRYPTO("drbg_nopr_sha256"); MODULE_ALIAS_CRYPTO("drbg_pr_sha1"); MODULE_ALIAS_CRYPTO("drbg_nopr_sha1"); /* * Increment buffer * * @dst buffer to increment * @add value to add */ static inline void drbg_add_buf(unsigned char *dst, size_t dstlen, const unsigned char *add, size_t addlen) { /* implied: dstlen > addlen */ unsigned char *dstptr; const unsigned char *addptr; unsigned int remainder = 0; size_t len = addlen; dstptr = dst + (dstlen-1); addptr = add + (addlen-1); while (len) { remainder += *dstptr + *addptr; *dstptr = remainder & 0xff; remainder >>= 8; len--; dstptr--; addptr--; } len = dstlen - addlen; while (len && remainder > 0) { remainder = *dstptr + 1; *dstptr = remainder & 0xff; remainder >>= 8; len--; dstptr--; } } /* * scratchpad usage: as drbg_hash_update and drbg_hash_df are used * interlinked, the scratchpad is used as follows: * drbg_hash_update * start: drbg->scratchpad * length: drbg_statelen(drbg) * drbg_hash_df: * start: drbg->scratchpad + drbg_statelen(drbg) * length: drbg_blocklen(drbg) * * drbg_hash_process_addtl uses the scratchpad, but fully completes * before either of the functions mentioned before are invoked. Therefore, * drbg_hash_process_addtl does not need to be specifically considered. */ /* Derivation Function for Hash DRBG as defined in 10.4.1 */ static int drbg_hash_df(struct drbg_state *drbg, unsigned char *outval, size_t outlen, struct list_head *entropylist) { int ret = 0; size_t len = 0; unsigned char input[5]; unsigned char *tmp = drbg->scratchpad + drbg_statelen(drbg); struct drbg_string data; /* 10.4.1 step 3 */ input[0] = 1; drbg_cpu_to_be32((outlen * 8), &input[1]); /* 10.4.1 step 4.1 -- concatenation of data for input into hash */ drbg_string_fill(&data, input, 5); list_add(&data.list, entropylist); /* 10.4.1 step 4 */ while (len < outlen) { short blocklen = 0; /* 10.4.1 step 4.1 */ ret = drbg_kcapi_hash(drbg, tmp, entropylist); if (ret) goto out; /* 10.4.1 step 4.2 */ input[0]++; blocklen = (drbg_blocklen(drbg) < (outlen - len)) ? drbg_blocklen(drbg) : (outlen - len); memcpy(outval + len, tmp, blocklen); len += blocklen; } out: memset(tmp, 0, drbg_blocklen(drbg)); return ret; } /* update function for Hash DRBG as defined in 10.1.1.2 / 10.1.1.3 */ static int drbg_hash_update(struct drbg_state *drbg, struct list_head *seed, int reseed) { int ret = 0; struct drbg_string data1, data2; LIST_HEAD(datalist); LIST_HEAD(datalist2); unsigned char *V = drbg->scratchpad; unsigned char prefix = DRBG_PREFIX1; if (!seed) return -EINVAL; if (reseed) { /* 10.1.1.3 step 1 */ memcpy(V, drbg->V, drbg_statelen(drbg)); drbg_string_fill(&data1, &prefix, 1); list_add_tail(&data1.list, &datalist); drbg_string_fill(&data2, V, drbg_statelen(drbg)); list_add_tail(&data2.list, &datalist); } list_splice_tail(seed, &datalist); /* 10.1.1.2 / 10.1.1.3 step 2 and 3 */ ret = drbg_hash_df(drbg, drbg->V, drbg_statelen(drbg), &datalist); if (ret) goto out; /* 10.1.1.2 / 10.1.1.3 step 4 */ prefix = DRBG_PREFIX0; drbg_string_fill(&data1, &prefix, 1); list_add_tail(&data1.list, &datalist2); drbg_string_fill(&data2, drbg->V, drbg_statelen(drbg)); list_add_tail(&data2.list, &datalist2); /* 10.1.1.2 / 10.1.1.3 step 4 */ ret = drbg_hash_df(drbg, drbg->C, drbg_statelen(drbg), &datalist2); out: memset(drbg->scratchpad, 0, drbg_statelen(drbg)); return ret; } /* processing of additional information string for Hash DRBG */ static int drbg_hash_process_addtl(struct drbg_state *drbg, struct list_head *addtl) { int ret = 0; struct drbg_string data1, data2; LIST_HEAD(datalist); unsigned char prefix = DRBG_PREFIX2; /* 10.1.1.4 step 2 */ if (!addtl || list_empty(addtl)) return 0; /* 10.1.1.4 step 2a */ drbg_string_fill(&data1, &prefix, 1); drbg_string_fill(&data2, drbg->V, drbg_statelen(drbg)); list_add_tail(&data1.list, &datalist); list_add_tail(&data2.list, &datalist); list_splice_tail(addtl, &datalist); ret = drbg_kcapi_hash(drbg, drbg->scratchpad, &datalist); if (ret) goto out; /* 10.1.1.4 step 2b */ drbg_add_buf(drbg->V, drbg_statelen(drbg), drbg->scratchpad, drbg_blocklen(drbg)); out: memset(drbg->scratchpad, 0, drbg_blocklen(drbg)); return ret; } /* Hashgen defined in 10.1.1.4 */ static int drbg_hash_hashgen(struct drbg_state *drbg, unsigned char *buf, unsigned int buflen) { int len = 0; int ret = 0; unsigned char *src = drbg->scratchpad; unsigned char *dst = drbg->scratchpad + drbg_statelen(drbg); struct drbg_string data; LIST_HEAD(datalist); /* 10.1.1.4 step hashgen 2 */ memcpy(src, drbg->V, drbg_statelen(drbg)); drbg_string_fill(&data, src, drbg_statelen(drbg)); list_add_tail(&data.list, &datalist); while (len < buflen) { unsigned int outlen = 0; /* 10.1.1.4 step hashgen 4.1 */ ret = drbg_kcapi_hash(drbg, dst, &datalist); if (ret) { len = ret; goto out; } outlen = (drbg_blocklen(drbg) < (buflen - len)) ? drbg_blocklen(drbg) : (buflen - len); /* 10.1.1.4 step hashgen 4.2 */ memcpy(buf + len, dst, outlen); len += outlen; /* 10.1.1.4 hashgen step 4.3 */ if (len < buflen) crypto_inc(src, drbg_statelen(drbg)); } out: memset(drbg->scratchpad, 0, (drbg_statelen(drbg) + drbg_blocklen(drbg))); return len; } /* generate function for Hash DRBG as defined in 10.1.1.4 */ static int drbg_hash_generate(struct drbg_state *drbg, unsigned char *buf, unsigned int buflen, struct list_head *addtl) { int len = 0; int ret = 0; union { unsigned char req[8]; __be64 req_int; } u; unsigned char prefix = DRBG_PREFIX3; struct drbg_string data1, data2; LIST_HEAD(datalist); /* 10.1.1.4 step 2 */ ret = drbg_hash_process_addtl(drbg, addtl); if (ret) return ret; /* 10.1.1.4 step 3 */ len = drbg_hash_hashgen(drbg, buf, buflen); /* this is the value H as documented in 10.1.1.4 */ /* 10.1.1.4 step 4 */ drbg_string_fill(&data1, &prefix, 1); list_add_tail(&data1.list, &datalist); drbg_string_fill(&data2, drbg->V, drbg_statelen(drbg)); list_add_tail(&data2.list, &datalist); ret = drbg_kcapi_hash(drbg, drbg->scratchpad, &datalist); if (ret) { len = ret; goto out; } /* 10.1.1.4 step 5 */ drbg_add_buf(drbg->V, drbg_statelen(drbg), drbg->scratchpad, drbg_blocklen(drbg)); drbg_add_buf(drbg->V, drbg_statelen(drbg), drbg->C, drbg_statelen(drbg)); u.req_int = cpu_to_be64(drbg->reseed_ctr); drbg_add_buf(drbg->V, drbg_statelen(drbg), u.req, 8); out: memset(drbg->scratchpad, 0, drbg_blocklen(drbg)); return len; } /* * scratchpad usage: as update and generate are used isolated, both * can use the scratchpad */ static const struct drbg_state_ops drbg_hash_ops = { .update = drbg_hash_update, .generate = drbg_hash_generate, .crypto_init = drbg_init_hash_kernel, .crypto_fini = drbg_fini_hash_kernel, }; #endif /* CONFIG_CRYPTO_DRBG_HASH */ /****************************************************************** * Functions common for DRBG implementations ******************************************************************/ static inline int __drbg_seed(struct drbg_state *drbg, struct list_head *seed, int reseed, enum drbg_seed_state new_seed_state) { int ret = drbg->d_ops->update(drbg, seed, reseed); if (ret) return ret; drbg->seeded = new_seed_state; drbg->last_seed_time = jiffies; /* 10.1.1.2 / 10.1.1.3 step 5 */ drbg->reseed_ctr = 1; switch (drbg->seeded) { case DRBG_SEED_STATE_UNSEEDED: /* Impossible, but handle it to silence compiler warnings. */ fallthrough; case DRBG_SEED_STATE_PARTIAL: /* * Require frequent reseeds until the seed source is * fully initialized. */ drbg->reseed_threshold = 50; break; case DRBG_SEED_STATE_FULL: /* * Seed source has become fully initialized, frequent * reseeds no longer required. */ drbg->reseed_threshold = drbg_max_requests(drbg); break; } return ret; } static inline int drbg_get_random_bytes(struct drbg_state *drbg, unsigned char *entropy, unsigned int entropylen) { int ret; do { get_random_bytes(entropy, entropylen); ret = drbg_fips_continuous_test(drbg, entropy); if (ret && ret != -EAGAIN) return ret; } while (ret); return 0; } static int drbg_seed_from_random(struct drbg_state *drbg) { struct drbg_string data; LIST_HEAD(seedlist); unsigned int entropylen = drbg_sec_strength(drbg->core->flags); unsigned char entropy[32]; int ret; BUG_ON(!entropylen); BUG_ON(entropylen > sizeof(entropy)); drbg_string_fill(&data, entropy, entropylen); list_add_tail(&data.list, &seedlist); ret = drbg_get_random_bytes(drbg, entropy, entropylen); if (ret) goto out; ret = __drbg_seed(drbg, &seedlist, true, DRBG_SEED_STATE_FULL); out: memzero_explicit(entropy, entropylen); return ret; } static bool drbg_nopr_reseed_interval_elapsed(struct drbg_state *drbg) { unsigned long next_reseed; /* Don't ever reseed from get_random_bytes() in test mode. */ if (list_empty(&drbg->test_data.list)) return false; /* * Obtain fresh entropy for the nopr DRBGs after 300s have * elapsed in order to still achieve sort of partial * prediction resistance over the time domain at least. Note * that the period of 300s has been chosen to match the * CRNG_RESEED_INTERVAL of the get_random_bytes()' chacha * rngs. */ next_reseed = drbg->last_seed_time + 300 * HZ; return time_after(jiffies, next_reseed); } /* * Seeding or reseeding of the DRBG * * @drbg: DRBG state struct * @pers: personalization / additional information buffer * @reseed: 0 for initial seed process, 1 for reseeding * * return: * 0 on success * error value otherwise */ static int drbg_seed(struct drbg_state *drbg, struct drbg_string *pers, bool reseed) { int ret; unsigned char entropy[((32 + 16) * 2)]; unsigned int entropylen = drbg_sec_strength(drbg->core->flags); struct drbg_string data1; LIST_HEAD(seedlist); enum drbg_seed_state new_seed_state = DRBG_SEED_STATE_FULL; /* 9.1 / 9.2 / 9.3.1 step 3 */ if (pers && pers->len > (drbg_max_addtl(drbg))) { pr_devel("DRBG: personalization string too long %zu\n", pers->len); return -EINVAL; } if (list_empty(&drbg->test_data.list)) { drbg_string_fill(&data1, drbg->test_data.buf, drbg->test_data.len); pr_devel("DRBG: using test entropy\n"); } else { /* * Gather entropy equal to the security strength of the DRBG. * With a derivation function, a nonce is required in addition * to the entropy. A nonce must be at least 1/2 of the security * strength of the DRBG in size. Thus, entropy + nonce is 3/2 * of the strength. The consideration of a nonce is only * applicable during initial seeding. */ BUG_ON(!entropylen); if (!reseed) entropylen = ((entropylen + 1) / 2) * 3; BUG_ON((entropylen * 2) > sizeof(entropy)); /* Get seed from in-kernel /dev/urandom */ if (!rng_is_initialized()) new_seed_state = DRBG_SEED_STATE_PARTIAL; ret = drbg_get_random_bytes(drbg, entropy, entropylen); if (ret) goto out; if (!drbg->jent) { drbg_string_fill(&data1, entropy, entropylen); pr_devel("DRBG: (re)seeding with %u bytes of entropy\n", entropylen); } else { /* * Get seed from Jitter RNG, failures are * fatal only in FIPS mode. */ ret = crypto_rng_get_bytes(drbg->jent, entropy + entropylen, entropylen); if (fips_enabled && ret) { pr_devel("DRBG: jent failed with %d\n", ret); /* * Do not treat the transient failure of the * Jitter RNG as an error that needs to be * reported. The combined number of the * maximum reseed threshold times the maximum * number of Jitter RNG transient errors is * less than the reseed threshold required by * SP800-90A allowing us to treat the * transient errors as such. * * However, we mandate that at least the first * seeding operation must succeed with the * Jitter RNG. */ if (!reseed || ret != -EAGAIN) goto out; } drbg_string_fill(&data1, entropy, entropylen * 2); pr_devel("DRBG: (re)seeding with %u bytes of entropy\n", entropylen * 2); } } list_add_tail(&data1.list, &seedlist); /* * concatenation of entropy with personalization str / addtl input) * the variable pers is directly handed in by the caller, so check its * contents whether it is appropriate */ if (pers && pers->buf && 0 < pers->len) { list_add_tail(&pers->list, &seedlist); pr_devel("DRBG: using personalization string\n"); } if (!reseed) { memset(drbg->V, 0, drbg_statelen(drbg)); memset(drbg->C, 0, drbg_statelen(drbg)); } ret = __drbg_seed(drbg, &seedlist, reseed, new_seed_state); out: memzero_explicit(entropy, entropylen * 2); return ret; } /* Free all substructures in a DRBG state without the DRBG state structure */ static inline void drbg_dealloc_state(struct drbg_state *drbg) { if (!drbg) return; kfree_sensitive(drbg->Vbuf); drbg->Vbuf = NULL; drbg->V = NULL; kfree_sensitive(drbg->Cbuf); drbg->Cbuf = NULL; drbg->C = NULL; kfree_sensitive(drbg->scratchpadbuf); drbg->scratchpadbuf = NULL; drbg->reseed_ctr = 0; drbg->d_ops = NULL; drbg->core = NULL; if (IS_ENABLED(CONFIG_CRYPTO_FIPS)) { kfree_sensitive(drbg->prev); drbg->prev = NULL; drbg->fips_primed = false; } } /* * Allocate all sub-structures for a DRBG state. * The DRBG state structure must already be allocated. */ static inline int drbg_alloc_state(struct drbg_state *drbg) { int ret = -ENOMEM; unsigned int sb_size = 0; switch (drbg->core->flags & DRBG_TYPE_MASK) { #ifdef CONFIG_CRYPTO_DRBG_HMAC case DRBG_HMAC: drbg->d_ops = &drbg_hmac_ops; break; #endif /* CONFIG_CRYPTO_DRBG_HMAC */ #ifdef CONFIG_CRYPTO_DRBG_HASH case DRBG_HASH: drbg->d_ops = &drbg_hash_ops; break; #endif /* CONFIG_CRYPTO_DRBG_HASH */ #ifdef CONFIG_CRYPTO_DRBG_CTR case DRBG_CTR: drbg->d_ops = &drbg_ctr_ops; break; #endif /* CONFIG_CRYPTO_DRBG_CTR */ default: ret = -EOPNOTSUPP; goto err; } ret = drbg->d_ops->crypto_init(drbg); if (ret < 0) goto err; drbg->Vbuf = kmalloc(drbg_statelen(drbg) + ret, GFP_KERNEL); if (!drbg->Vbuf) { ret = -ENOMEM; goto fini; } drbg->V = PTR_ALIGN(drbg->Vbuf, ret + 1); drbg->Cbuf = kmalloc(drbg_statelen(drbg) + ret, GFP_KERNEL); if (!drbg->Cbuf) { ret = -ENOMEM; goto fini; } drbg->C = PTR_ALIGN(drbg->Cbuf, ret + 1); /* scratchpad is only generated for CTR and Hash */ if (drbg->core->flags & DRBG_HMAC) sb_size = 0; else if (drbg->core->flags & DRBG_CTR) sb_size = drbg_statelen(drbg) + drbg_blocklen(drbg) + /* temp */ drbg_statelen(drbg) + /* df_data */ drbg_blocklen(drbg) + /* pad */ drbg_blocklen(drbg) + /* iv */ drbg_statelen(drbg) + drbg_blocklen(drbg); /* temp */ else sb_size = drbg_statelen(drbg) + drbg_blocklen(drbg); if (0 < sb_size) { drbg->scratchpadbuf = kzalloc(sb_size + ret, GFP_KERNEL); if (!drbg->scratchpadbuf) { ret = -ENOMEM; goto fini; } drbg->scratchpad = PTR_ALIGN(drbg->scratchpadbuf, ret + 1); } if (IS_ENABLED(CONFIG_CRYPTO_FIPS)) { drbg->prev = kzalloc(drbg_sec_strength(drbg->core->flags), GFP_KERNEL); if (!drbg->prev) { ret = -ENOMEM; goto fini; } drbg->fips_primed = false; } return 0; fini: drbg->d_ops->crypto_fini(drbg); err: drbg_dealloc_state(drbg); return ret; } /************************************************************************* * DRBG interface functions *************************************************************************/ /* * DRBG generate function as required by SP800-90A - this function * generates random numbers * * @drbg DRBG state handle * @buf Buffer where to store the random numbers -- the buffer must already * be pre-allocated by caller * @buflen Length of output buffer - this value defines the number of random * bytes pulled from DRBG * @addtl Additional input that is mixed into state, may be NULL -- note * the entropy is pulled by the DRBG internally unconditionally * as defined in SP800-90A. The additional input is mixed into * the state in addition to the pulled entropy. * * return: 0 when all bytes are generated; < 0 in case of an error */ static int drbg_generate(struct drbg_state *drbg, unsigned char *buf, unsigned int buflen, struct drbg_string *addtl) { int len = 0; LIST_HEAD(addtllist); if (!drbg->core) { pr_devel("DRBG: not yet seeded\n"); return -EINVAL; } if (0 == buflen || !buf) { pr_devel("DRBG: no output buffer provided\n"); return -EINVAL; } if (addtl && NULL == addtl->buf && 0 < addtl->len) { pr_devel("DRBG: wrong format of additional information\n"); return -EINVAL; } /* 9.3.1 step 2 */ len = -EINVAL; if (buflen > (drbg_max_request_bytes(drbg))) { pr_devel("DRBG: requested random numbers too large %u\n", buflen); goto err; } /* 9.3.1 step 3 is implicit with the chosen DRBG */ /* 9.3.1 step 4 */ if (addtl && addtl->len > (drbg_max_addtl(drbg))) { pr_devel("DRBG: additional information string too long %zu\n", addtl->len); goto err; } /* 9.3.1 step 5 is implicit with the chosen DRBG */ /* * 9.3.1 step 6 and 9 supplemented by 9.3.2 step c is implemented * here. The spec is a bit convoluted here, we make it simpler. */ if (drbg->reseed_threshold < drbg->reseed_ctr) drbg->seeded = DRBG_SEED_STATE_UNSEEDED; if (drbg->pr || drbg->seeded == DRBG_SEED_STATE_UNSEEDED) { pr_devel("DRBG: reseeding before generation (prediction " "resistance: %s, state %s)\n", drbg->pr ? "true" : "false", (drbg->seeded == DRBG_SEED_STATE_FULL ? "seeded" : "unseeded")); /* 9.3.1 steps 7.1 through 7.3 */ len = drbg_seed(drbg, addtl, true); if (len) goto err; /* 9.3.1 step 7.4 */ addtl = NULL; } else if (rng_is_initialized() && (drbg->seeded == DRBG_SEED_STATE_PARTIAL || drbg_nopr_reseed_interval_elapsed(drbg))) { len = drbg_seed_from_random(drbg); if (len) goto err; } if (addtl && 0 < addtl->len) list_add_tail(&addtl->list, &addtllist); /* 9.3.1 step 8 and 10 */ len = drbg->d_ops->generate(drbg, buf, buflen, &addtllist); /* 10.1.1.4 step 6, 10.1.2.5 step 7, 10.2.1.5.2 step 7 */ drbg->reseed_ctr++; if (0 >= len) goto err; /* * Section 11.3.3 requires to re-perform self tests after some * generated random numbers. The chosen value after which self * test is performed is arbitrary, but it should be reasonable. * However, we do not perform the self tests because of the following * reasons: it is mathematically impossible that the initial self tests * were successfully and the following are not. If the initial would * pass and the following would not, the kernel integrity is violated. * In this case, the entire kernel operation is questionable and it * is unlikely that the integrity violation only affects the * correct operation of the DRBG. * * Albeit the following code is commented out, it is provided in * case somebody has a need to implement the test of 11.3.3. */ #if 0 if (drbg->reseed_ctr && !(drbg->reseed_ctr % 4096)) { int err = 0; pr_devel("DRBG: start to perform self test\n"); if (drbg->core->flags & DRBG_HMAC) err = alg_test("drbg_pr_hmac_sha256", "drbg_pr_hmac_sha256", 0, 0); else if (drbg->core->flags & DRBG_CTR) err = alg_test("drbg_pr_ctr_aes128", "drbg_pr_ctr_aes128", 0, 0); else err = alg_test("drbg_pr_sha256", "drbg_pr_sha256", 0, 0); if (err) { pr_err("DRBG: periodical self test failed\n"); /* * uninstantiate implies that from now on, only errors * are returned when reusing this DRBG cipher handle */ drbg_uninstantiate(drbg); return 0; } else { pr_devel("DRBG: self test successful\n"); } } #endif /* * All operations were successful, return 0 as mandated by * the kernel crypto API interface. */ len = 0; err: return len; } /* * Wrapper around drbg_generate which can pull arbitrary long strings * from the DRBG without hitting the maximum request limitation. * * Parameters: see drbg_generate * Return codes: see drbg_generate -- if one drbg_generate request fails, * the entire drbg_generate_long request fails */ static int drbg_generate_long(struct drbg_state *drbg, unsigned char *buf, unsigned int buflen, struct drbg_string *addtl) { unsigned int len = 0; unsigned int slice = 0; do { int err = 0; unsigned int chunk = 0; slice = ((buflen - len) / drbg_max_request_bytes(drbg)); chunk = slice ? drbg_max_request_bytes(drbg) : (buflen - len); mutex_lock(&drbg->drbg_mutex); err = drbg_generate(drbg, buf + len, chunk, addtl); mutex_unlock(&drbg->drbg_mutex); if (0 > err) return err; len += chunk; } while (slice > 0 && (len < buflen)); return 0; } static int drbg_prepare_hrng(struct drbg_state *drbg) { /* We do not need an HRNG in test mode. */ if (list_empty(&drbg->test_data.list)) return 0; drbg->jent = crypto_alloc_rng("jitterentropy_rng", 0, 0); if (IS_ERR(drbg->jent)) { const int err = PTR_ERR(drbg->jent); drbg->jent = NULL; if (fips_enabled) return err; pr_info("DRBG: Continuing without Jitter RNG\n"); } return 0; } /* * DRBG instantiation function as required by SP800-90A - this function * sets up the DRBG handle, performs the initial seeding and all sanity * checks required by SP800-90A * * @drbg memory of state -- if NULL, new memory is allocated * @pers Personalization string that is mixed into state, may be NULL -- note * the entropy is pulled by the DRBG internally unconditionally * as defined in SP800-90A. The additional input is mixed into * the state in addition to the pulled entropy. * @coreref reference to core * @pr prediction resistance enabled * * return * 0 on success * error value otherwise */ static int drbg_instantiate(struct drbg_state *drbg, struct drbg_string *pers, int coreref, bool pr) { int ret; bool reseed = true; pr_devel("DRBG: Initializing DRBG core %d with prediction resistance " "%s\n", coreref, pr ? "enabled" : "disabled"); mutex_lock(&drbg->drbg_mutex); /* 9.1 step 1 is implicit with the selected DRBG type */ /* * 9.1 step 2 is implicit as caller can select prediction resistance * and the flag is copied into drbg->flags -- * all DRBG types support prediction resistance */ /* 9.1 step 4 is implicit in drbg_sec_strength */ if (!drbg->core) { drbg->core = &drbg_cores[coreref]; drbg->pr = pr; drbg->seeded = DRBG_SEED_STATE_UNSEEDED; drbg->last_seed_time = 0; drbg->reseed_threshold = drbg_max_requests(drbg); ret = drbg_alloc_state(drbg); if (ret) goto unlock; ret = drbg_prepare_hrng(drbg); if (ret) goto free_everything; reseed = false; } ret = drbg_seed(drbg, pers, reseed); if (ret && !reseed) goto free_everything; mutex_unlock(&drbg->drbg_mutex); return ret; unlock: mutex_unlock(&drbg->drbg_mutex); return ret; free_everything: mutex_unlock(&drbg->drbg_mutex); drbg_uninstantiate(drbg); return ret; } /* * DRBG uninstantiate function as required by SP800-90A - this function * frees all buffers and the DRBG handle * * @drbg DRBG state handle * * return * 0 on success */ static int drbg_uninstantiate(struct drbg_state *drbg) { if (!IS_ERR_OR_NULL(drbg->jent)) crypto_free_rng(drbg->jent); drbg->jent = NULL; if (drbg->d_ops) drbg->d_ops->crypto_fini(drbg); drbg_dealloc_state(drbg); /* no scrubbing of test_data -- this shall survive an uninstantiate */ return 0; } /* * Helper function for setting the test data in the DRBG * * @drbg DRBG state handle * @data test data * @len test data length */ static void drbg_kcapi_set_entropy(struct crypto_rng *tfm, const u8 *data, unsigned int len) { struct drbg_state *drbg = crypto_rng_ctx(tfm); mutex_lock(&drbg->drbg_mutex); drbg_string_fill(&drbg->test_data, data, len); mutex_unlock(&drbg->drbg_mutex); } /*************************************************************** * Kernel crypto API cipher invocations requested by DRBG ***************************************************************/ #if defined(CONFIG_CRYPTO_DRBG_HASH) || defined(CONFIG_CRYPTO_DRBG_HMAC) struct sdesc { struct shash_desc shash; char ctx[]; }; static int drbg_init_hash_kernel(struct drbg_state *drbg) { struct sdesc *sdesc; struct crypto_shash *tfm; tfm = crypto_alloc_shash(drbg->core->backend_cra_name, 0, 0); if (IS_ERR(tfm)) { pr_info("DRBG: could not allocate digest TFM handle: %s\n", drbg->core->backend_cra_name); return PTR_ERR(tfm); } BUG_ON(drbg_blocklen(drbg) != crypto_shash_digestsize(tfm)); sdesc = kzalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm), GFP_KERNEL); if (!sdesc) { crypto_free_shash(tfm); return -ENOMEM; } sdesc->shash.tfm = tfm; drbg->priv_data = sdesc; return 0; } static int drbg_fini_hash_kernel(struct drbg_state *drbg) { struct sdesc *sdesc = drbg->priv_data; if (sdesc) { crypto_free_shash(sdesc->shash.tfm); kfree_sensitive(sdesc); } drbg->priv_data = NULL; return 0; } static void drbg_kcapi_hmacsetkey(struct drbg_state *drbg, const unsigned char *key) { struct sdesc *sdesc = drbg->priv_data; crypto_shash_setkey(sdesc->shash.tfm, key, drbg_statelen(drbg)); } static int drbg_kcapi_hash(struct drbg_state *drbg, unsigned char *outval, const struct list_head *in) { struct sdesc *sdesc = drbg->priv_data; struct drbg_string *input = NULL; crypto_shash_init(&sdesc->shash); list_for_each_entry(input, in, list) crypto_shash_update(&sdesc->shash, input->buf, input->len); return crypto_shash_final(&sdesc->shash, outval); } #endif /* (CONFIG_CRYPTO_DRBG_HASH || CONFIG_CRYPTO_DRBG_HMAC) */ #ifdef CONFIG_CRYPTO_DRBG_CTR static int drbg_fini_sym_kernel(struct drbg_state *drbg) { struct crypto_cipher *tfm = (struct crypto_cipher *)drbg->priv_data; if (tfm) crypto_free_cipher(tfm); drbg->priv_data = NULL; if (drbg->ctr_handle) crypto_free_skcipher(drbg->ctr_handle); drbg->ctr_handle = NULL; if (drbg->ctr_req) skcipher_request_free(drbg->ctr_req); drbg->ctr_req = NULL; kfree(drbg->outscratchpadbuf); drbg->outscratchpadbuf = NULL; return 0; } static int drbg_init_sym_kernel(struct drbg_state *drbg) { struct crypto_cipher *tfm; struct crypto_skcipher *sk_tfm; struct skcipher_request *req; unsigned int alignmask; char ctr_name[CRYPTO_MAX_ALG_NAME]; tfm = crypto_alloc_cipher(drbg->core->backend_cra_name, 0, 0); if (IS_ERR(tfm)) { pr_info("DRBG: could not allocate cipher TFM handle: %s\n", drbg->core->backend_cra_name); return PTR_ERR(tfm); } BUG_ON(drbg_blocklen(drbg) != crypto_cipher_blocksize(tfm)); drbg->priv_data = tfm; if (snprintf(ctr_name, CRYPTO_MAX_ALG_NAME, "ctr(%s)", drbg->core->backend_cra_name) >= CRYPTO_MAX_ALG_NAME) { drbg_fini_sym_kernel(drbg); return -EINVAL; } sk_tfm = crypto_alloc_skcipher(ctr_name, 0, 0); if (IS_ERR(sk_tfm)) { pr_info("DRBG: could not allocate CTR cipher TFM handle: %s\n", ctr_name); drbg_fini_sym_kernel(drbg); return PTR_ERR(sk_tfm); } drbg->ctr_handle = sk_tfm; crypto_init_wait(&drbg->ctr_wait); req = skcipher_request_alloc(sk_tfm, GFP_KERNEL); if (!req) { pr_info("DRBG: could not allocate request queue\n"); drbg_fini_sym_kernel(drbg); return -ENOMEM; } drbg->ctr_req = req; skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &drbg->ctr_wait); alignmask = crypto_skcipher_alignmask(sk_tfm); drbg->outscratchpadbuf = kmalloc(DRBG_OUTSCRATCHLEN + alignmask, GFP_KERNEL); if (!drbg->outscratchpadbuf) { drbg_fini_sym_kernel(drbg); return -ENOMEM; } drbg->outscratchpad = (u8 *)PTR_ALIGN(drbg->outscratchpadbuf, alignmask + 1); sg_init_table(&drbg->sg_in, 1); sg_init_one(&drbg->sg_out, drbg->outscratchpad, DRBG_OUTSCRATCHLEN); return alignmask; } static void drbg_kcapi_symsetkey(struct drbg_state *drbg, const unsigned char *key) { struct crypto_cipher *tfm = drbg->priv_data; crypto_cipher_setkey(tfm, key, (drbg_keylen(drbg))); } static int drbg_kcapi_sym(struct drbg_state *drbg, unsigned char *outval, const struct drbg_string *in) { struct crypto_cipher *tfm = drbg->priv_data; /* there is only component in *in */ BUG_ON(in->len < drbg_blocklen(drbg)); crypto_cipher_encrypt_one(tfm, outval, in->buf); return 0; } static int drbg_kcapi_sym_ctr(struct drbg_state *drbg, u8 *inbuf, u32 inlen, u8 *outbuf, u32 outlen) { struct scatterlist *sg_in = &drbg->sg_in, *sg_out = &drbg->sg_out; u32 scratchpad_use = min_t(u32, outlen, DRBG_OUTSCRATCHLEN); int ret; if (inbuf) { /* Use caller-provided input buffer */ sg_set_buf(sg_in, inbuf, inlen); } else { /* Use scratchpad for in-place operation */ inlen = scratchpad_use; memset(drbg->outscratchpad, 0, scratchpad_use); sg_set_buf(sg_in, drbg->outscratchpad, scratchpad_use); } while (outlen) { u32 cryptlen = min3(inlen, outlen, (u32)DRBG_OUTSCRATCHLEN); /* Output buffer may not be valid for SGL, use scratchpad */ skcipher_request_set_crypt(drbg->ctr_req, sg_in, sg_out, cryptlen, drbg->V); ret = crypto_wait_req(crypto_skcipher_encrypt(drbg->ctr_req), &drbg->ctr_wait); if (ret) goto out; crypto_init_wait(&drbg->ctr_wait); memcpy(outbuf, drbg->outscratchpad, cryptlen); memzero_explicit(drbg->outscratchpad, cryptlen); outlen -= cryptlen; outbuf += cryptlen; } ret = 0; out: return ret; } #endif /* CONFIG_CRYPTO_DRBG_CTR */ /*************************************************************** * Kernel crypto API interface to register DRBG ***************************************************************/ /* * Look up the DRBG flags by given kernel crypto API cra_name * The code uses the drbg_cores definition to do this * * @cra_name kernel crypto API cra_name * @coreref reference to integer which is filled with the pointer to * the applicable core * @pr reference for setting prediction resistance * * return: flags */ static inline void drbg_convert_tfm_core(const char *cra_driver_name, int *coreref, bool *pr) { int i = 0; size_t start = 0; int len = 0; *pr = true; /* disassemble the names */ if (!memcmp(cra_driver_name, "drbg_nopr_", 10)) { start = 10; *pr = false; } else if (!memcmp(cra_driver_name, "drbg_pr_", 8)) { start = 8; } else { return; } /* remove the first part */ len = strlen(cra_driver_name) - start; for (i = 0; ARRAY_SIZE(drbg_cores) > i; i++) { if (!memcmp(cra_driver_name + start, drbg_cores[i].cra_name, len)) { *coreref = i; return; } } } static int drbg_kcapi_init(struct crypto_tfm *tfm) { struct drbg_state *drbg = crypto_tfm_ctx(tfm); mutex_init(&drbg->drbg_mutex); return 0; } static void drbg_kcapi_cleanup(struct crypto_tfm *tfm) { drbg_uninstantiate(crypto_tfm_ctx(tfm)); } /* * Generate random numbers invoked by the kernel crypto API: * The API of the kernel crypto API is extended as follows: * * src is additional input supplied to the RNG. * slen is the length of src. * dst is the output buffer where random data is to be stored. * dlen is the length of dst. */ static int drbg_kcapi_random(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int dlen) { struct drbg_state *drbg = crypto_rng_ctx(tfm); struct drbg_string *addtl = NULL; struct drbg_string string; if (slen) { /* linked list variable is now local to allow modification */ drbg_string_fill(&string, src, slen); addtl = &string; } return drbg_generate_long(drbg, dst, dlen, addtl); } /* * Seed the DRBG invoked by the kernel crypto API */ static int drbg_kcapi_seed(struct crypto_rng *tfm, const u8 *seed, unsigned int slen) { struct drbg_state *drbg = crypto_rng_ctx(tfm); struct crypto_tfm *tfm_base = crypto_rng_tfm(tfm); bool pr = false; struct drbg_string string; struct drbg_string *seed_string = NULL; int coreref = 0; drbg_convert_tfm_core(crypto_tfm_alg_driver_name(tfm_base), &coreref, &pr); if (0 < slen) { drbg_string_fill(&string, seed, slen); seed_string = &string; } return drbg_instantiate(drbg, seed_string, coreref, pr); } /*************************************************************** * Kernel module: code to load the module ***************************************************************/ /* * Tests as defined in 11.3.2 in addition to the cipher tests: testing * of the error handling. * * Note: testing of failing seed source as defined in 11.3.2 is not applicable * as seed source of get_random_bytes does not fail. * * Note 2: There is no sensible way of testing the reseed counter * enforcement, so skip it. */ static inline int __init drbg_healthcheck_sanity(void) { int len = 0; #define OUTBUFLEN 16 unsigned char buf[OUTBUFLEN]; struct drbg_state *drbg = NULL; int ret; int rc = -EFAULT; bool pr = false; int coreref = 0; struct drbg_string addtl; size_t max_addtllen, max_request_bytes; /* only perform test in FIPS mode */ if (!fips_enabled) return 0; #ifdef CONFIG_CRYPTO_DRBG_CTR drbg_convert_tfm_core("drbg_nopr_ctr_aes128", &coreref, &pr); #elif defined CONFIG_CRYPTO_DRBG_HASH drbg_convert_tfm_core("drbg_nopr_sha256", &coreref, &pr); #else drbg_convert_tfm_core("drbg_nopr_hmac_sha256", &coreref, &pr); #endif drbg = kzalloc(sizeof(struct drbg_state), GFP_KERNEL); if (!drbg) return -ENOMEM; mutex_init(&drbg->drbg_mutex); drbg->core = &drbg_cores[coreref]; drbg->reseed_threshold = drbg_max_requests(drbg); /* * if the following tests fail, it is likely that there is a buffer * overflow as buf is much smaller than the requested or provided * string lengths -- in case the error handling does not succeed * we may get an OOPS. And we want to get an OOPS as this is a * grave bug. */ max_addtllen = drbg_max_addtl(drbg); max_request_bytes = drbg_max_request_bytes(drbg); drbg_string_fill(&addtl, buf, max_addtllen + 1); /* overflow addtllen with additonal info string */ len = drbg_generate(drbg, buf, OUTBUFLEN, &addtl); BUG_ON(0 < len); /* overflow max_bits */ len = drbg_generate(drbg, buf, (max_request_bytes + 1), NULL); BUG_ON(0 < len); /* overflow max addtllen with personalization string */ ret = drbg_seed(drbg, &addtl, false); BUG_ON(0 == ret); /* all tests passed */ rc = 0; pr_devel("DRBG: Sanity tests for failure code paths successfully " "completed\n"); kfree(drbg); return rc; } static struct rng_alg drbg_algs[22]; /* * Fill the array drbg_algs used to register the different DRBGs * with the kernel crypto API. To fill the array, the information * from drbg_cores[] is used. */ static inline void __init drbg_fill_array(struct rng_alg *alg, const struct drbg_core *core, int pr) { int pos = 0; static int priority = 200; memcpy(alg->base.cra_name, "stdrng", 6); if (pr) { memcpy(alg->base.cra_driver_name, "drbg_pr_", 8); pos = 8; } else { memcpy(alg->base.cra_driver_name, "drbg_nopr_", 10); pos = 10; } memcpy(alg->base.cra_driver_name + pos, core->cra_name, strlen(core->cra_name)); alg->base.cra_priority = priority; priority++; /* * If FIPS mode enabled, the selected DRBG shall have the * highest cra_priority over other stdrng instances to ensure * it is selected. */ if (fips_enabled) alg->base.cra_priority += 200; alg->base.cra_ctxsize = sizeof(struct drbg_state); alg->base.cra_module = THIS_MODULE; alg->base.cra_init = drbg_kcapi_init; alg->base.cra_exit = drbg_kcapi_cleanup; alg->generate = drbg_kcapi_random; alg->seed = drbg_kcapi_seed; alg->set_ent = drbg_kcapi_set_entropy; alg->seedsize = 0; } static int __init drbg_init(void) { unsigned int i = 0; /* pointer to drbg_algs */ unsigned int j = 0; /* pointer to drbg_cores */ int ret; ret = drbg_healthcheck_sanity(); if (ret) return ret; if (ARRAY_SIZE(drbg_cores) * 2 > ARRAY_SIZE(drbg_algs)) { pr_info("DRBG: Cannot register all DRBG types" "(slots needed: %zu, slots available: %zu)\n", ARRAY_SIZE(drbg_cores) * 2, ARRAY_SIZE(drbg_algs)); return -EFAULT; } /* * each DRBG definition can be used with PR and without PR, thus * we instantiate each DRBG in drbg_cores[] twice. * * As the order of placing them into the drbg_algs array matters * (the later DRBGs receive a higher cra_priority) we register the * prediction resistance DRBGs first as the should not be too * interesting. */ for (j = 0; ARRAY_SIZE(drbg_cores) > j; j++, i++) drbg_fill_array(&drbg_algs[i], &drbg_cores[j], 1); for (j = 0; ARRAY_SIZE(drbg_cores) > j; j++, i++) drbg_fill_array(&drbg_algs[i], &drbg_cores[j], 0); return crypto_register_rngs(drbg_algs, (ARRAY_SIZE(drbg_cores) * 2)); } static void __exit drbg_exit(void) { crypto_unregister_rngs(drbg_algs, (ARRAY_SIZE(drbg_cores) * 2)); } subsys_initcall(drbg_init); module_exit(drbg_exit); #ifndef CRYPTO_DRBG_HASH_STRING #define CRYPTO_DRBG_HASH_STRING "" #endif #ifndef CRYPTO_DRBG_HMAC_STRING #define CRYPTO_DRBG_HMAC_STRING "" #endif #ifndef CRYPTO_DRBG_CTR_STRING #define CRYPTO_DRBG_CTR_STRING "" #endif MODULE_LICENSE("GPL"); MODULE_AUTHOR("Stephan Mueller <smueller@chronox.de>"); MODULE_DESCRIPTION("NIST SP800-90A Deterministic Random Bit Generator (DRBG) " "using following cores: " CRYPTO_DRBG_HASH_STRING CRYPTO_DRBG_HMAC_STRING CRYPTO_DRBG_CTR_STRING); MODULE_ALIAS_CRYPTO("stdrng"); MODULE_IMPORT_NS(CRYPTO_INTERNAL);
410 410 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* * proc.c - procfs support for Protocol family CAN core module * * Copyright (c) 2002-2007 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/if_arp.h> #include <linux/can/can-ml.h> #include <linux/can/core.h> #include "af_can.h" /* * proc filenames for the PF_CAN core */ #define CAN_PROC_STATS "stats" #define CAN_PROC_RESET_STATS "reset_stats" #define CAN_PROC_RCVLIST_ALL "rcvlist_all" #define CAN_PROC_RCVLIST_FIL "rcvlist_fil" #define CAN_PROC_RCVLIST_INV "rcvlist_inv" #define CAN_PROC_RCVLIST_SFF "rcvlist_sff" #define CAN_PROC_RCVLIST_EFF "rcvlist_eff" #define CAN_PROC_RCVLIST_ERR "rcvlist_err" static int user_reset; static const char rx_list_name[][8] = { [RX_ERR] = "rx_err", [RX_ALL] = "rx_all", [RX_FIL] = "rx_fil", [RX_INV] = "rx_inv", }; /* * af_can statistics stuff */ static void can_init_stats(struct net *net) { struct can_pkg_stats *pkg_stats = net->can.pkg_stats; struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; /* * This memset function is called from a timer context (when * can_stattimer is active which is the default) OR in a process * context (reading the proc_fs when can_stattimer is disabled). */ memset(pkg_stats, 0, sizeof(struct can_pkg_stats)); pkg_stats->jiffies_init = jiffies; rcv_lists_stats->stats_reset++; if (user_reset) { user_reset = 0; rcv_lists_stats->user_reset++; } } static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif, unsigned long count) { if (oldjif == newjif) return 0; /* see can_stat_update() - this should NEVER happen! */ if (count > (ULONG_MAX / HZ)) { printk(KERN_ERR "can: calc_rate: count exceeded! %ld\n", count); return 99999999; } return (count * HZ) / (newjif - oldjif); } void can_stat_update(struct timer_list *t) { struct net *net = from_timer(net, t, can.stattimer); struct can_pkg_stats *pkg_stats = net->can.pkg_stats; unsigned long j = jiffies; /* snapshot */ /* restart counting in timer context on user request */ if (user_reset) can_init_stats(net); /* restart counting on jiffies overflow */ if (j < pkg_stats->jiffies_init) can_init_stats(net); /* prevent overflow in calc_rate() */ if (pkg_stats->rx_frames > (ULONG_MAX / HZ)) can_init_stats(net); /* prevent overflow in calc_rate() */ if (pkg_stats->tx_frames > (ULONG_MAX / HZ)) can_init_stats(net); /* matches overflow - very improbable */ if (pkg_stats->matches > (ULONG_MAX / 100)) can_init_stats(net); /* calc total values */ if (pkg_stats->rx_frames) pkg_stats->total_rx_match_ratio = (pkg_stats->matches * 100) / pkg_stats->rx_frames; pkg_stats->total_tx_rate = calc_rate(pkg_stats->jiffies_init, j, pkg_stats->tx_frames); pkg_stats->total_rx_rate = calc_rate(pkg_stats->jiffies_init, j, pkg_stats->rx_frames); /* calc current values */ if (pkg_stats->rx_frames_delta) pkg_stats->current_rx_match_ratio = (pkg_stats->matches_delta * 100) / pkg_stats->rx_frames_delta; pkg_stats->current_tx_rate = calc_rate(0, HZ, pkg_stats->tx_frames_delta); pkg_stats->current_rx_rate = calc_rate(0, HZ, pkg_stats->rx_frames_delta); /* check / update maximum values */ if (pkg_stats->max_tx_rate < pkg_stats->current_tx_rate) pkg_stats->max_tx_rate = pkg_stats->current_tx_rate; if (pkg_stats->max_rx_rate < pkg_stats->current_rx_rate) pkg_stats->max_rx_rate = pkg_stats->current_rx_rate; if (pkg_stats->max_rx_match_ratio < pkg_stats->current_rx_match_ratio) pkg_stats->max_rx_match_ratio = pkg_stats->current_rx_match_ratio; /* clear values for 'current rate' calculation */ pkg_stats->tx_frames_delta = 0; pkg_stats->rx_frames_delta = 0; pkg_stats->matches_delta = 0; /* restart timer (one second) */ mod_timer(&net->can.stattimer, round_jiffies(jiffies + HZ)); } /* * proc read functions */ static void can_print_rcvlist(struct seq_file *m, struct hlist_head *rx_list, struct net_device *dev) { struct receiver *r; hlist_for_each_entry_rcu(r, rx_list, list) { char *fmt = (r->can_id & CAN_EFF_FLAG)? " %-5s %08x %08x %pK %pK %8ld %s\n" : " %-5s %03x %08x %pK %pK %8ld %s\n"; seq_printf(m, fmt, DNAME(dev), r->can_id, r->mask, r->func, r->data, r->matches, r->ident); } } static void can_print_recv_banner(struct seq_file *m) { /* * can1. 00000000 00000000 00000000 * ....... 0 tp20 */ if (IS_ENABLED(CONFIG_64BIT)) seq_puts(m, " device can_id can_mask function userdata matches ident\n"); else seq_puts(m, " device can_id can_mask function userdata matches ident\n"); } static int can_stats_proc_show(struct seq_file *m, void *v) { struct net *net = m->private; struct can_pkg_stats *pkg_stats = net->can.pkg_stats; struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; seq_putc(m, '\n'); seq_printf(m, " %8ld transmitted frames (TXF)\n", pkg_stats->tx_frames); seq_printf(m, " %8ld received frames (RXF)\n", pkg_stats->rx_frames); seq_printf(m, " %8ld matched frames (RXMF)\n", pkg_stats->matches); seq_putc(m, '\n'); if (net->can.stattimer.function == can_stat_update) { seq_printf(m, " %8ld %% total match ratio (RXMR)\n", pkg_stats->total_rx_match_ratio); seq_printf(m, " %8ld frames/s total tx rate (TXR)\n", pkg_stats->total_tx_rate); seq_printf(m, " %8ld frames/s total rx rate (RXR)\n", pkg_stats->total_rx_rate); seq_putc(m, '\n'); seq_printf(m, " %8ld %% current match ratio (CRXMR)\n", pkg_stats->current_rx_match_ratio); seq_printf(m, " %8ld frames/s current tx rate (CTXR)\n", pkg_stats->current_tx_rate); seq_printf(m, " %8ld frames/s current rx rate (CRXR)\n", pkg_stats->current_rx_rate); seq_putc(m, '\n'); seq_printf(m, " %8ld %% max match ratio (MRXMR)\n", pkg_stats->max_rx_match_ratio); seq_printf(m, " %8ld frames/s max tx rate (MTXR)\n", pkg_stats->max_tx_rate); seq_printf(m, " %8ld frames/s max rx rate (MRXR)\n", pkg_stats->max_rx_rate); seq_putc(m, '\n'); } seq_printf(m, " %8ld current receive list entries (CRCV)\n", rcv_lists_stats->rcv_entries); seq_printf(m, " %8ld maximum receive list entries (MRCV)\n", rcv_lists_stats->rcv_entries_max); if (rcv_lists_stats->stats_reset) seq_printf(m, "\n %8ld statistic resets (STR)\n", rcv_lists_stats->stats_reset); if (rcv_lists_stats->user_reset) seq_printf(m, " %8ld user statistic resets (USTR)\n", rcv_lists_stats->user_reset); seq_putc(m, '\n'); return 0; } static int can_reset_stats_proc_show(struct seq_file *m, void *v) { struct net *net = m->private; struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; struct can_pkg_stats *pkg_stats = net->can.pkg_stats; user_reset = 1; if (net->can.stattimer.function == can_stat_update) { seq_printf(m, "Scheduled statistic reset #%ld.\n", rcv_lists_stats->stats_reset + 1); } else { if (pkg_stats->jiffies_init != jiffies) can_init_stats(net); seq_printf(m, "Performed statistic reset #%ld.\n", rcv_lists_stats->stats_reset); } return 0; } static inline void can_rcvlist_proc_show_one(struct seq_file *m, int idx, struct net_device *dev, struct can_dev_rcv_lists *dev_rcv_lists) { if (!hlist_empty(&dev_rcv_lists->rx[idx])) { can_print_recv_banner(m); can_print_rcvlist(m, &dev_rcv_lists->rx[idx], dev); } else seq_printf(m, " (%s: no entry)\n", DNAME(dev)); } static int can_rcvlist_proc_show(struct seq_file *m, void *v) { /* double cast to prevent GCC warning */ int idx = (int)(long)pde_data(m->file->f_inode); struct net_device *dev; struct can_dev_rcv_lists *dev_rcv_lists; struct net *net = m->private; seq_printf(m, "\nreceive list '%s':\n", rx_list_name[idx]); rcu_read_lock(); /* receive list for 'all' CAN devices (dev == NULL) */ dev_rcv_lists = net->can.rx_alldev_list; can_rcvlist_proc_show_one(m, idx, NULL, dev_rcv_lists); /* receive list for registered CAN devices */ for_each_netdev_rcu(net, dev) { struct can_ml_priv *can_ml = can_get_ml_priv(dev); if (can_ml) can_rcvlist_proc_show_one(m, idx, dev, &can_ml->dev_rcv_lists); } rcu_read_unlock(); seq_putc(m, '\n'); return 0; } static inline void can_rcvlist_proc_show_array(struct seq_file *m, struct net_device *dev, struct hlist_head *rcv_array, unsigned int rcv_array_sz) { unsigned int i; int all_empty = 1; /* check whether at least one list is non-empty */ for (i = 0; i < rcv_array_sz; i++) if (!hlist_empty(&rcv_array[i])) { all_empty = 0; break; } if (!all_empty) { can_print_recv_banner(m); for (i = 0; i < rcv_array_sz; i++) { if (!hlist_empty(&rcv_array[i])) can_print_rcvlist(m, &rcv_array[i], dev); } } else seq_printf(m, " (%s: no entry)\n", DNAME(dev)); } static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v) { struct net_device *dev; struct can_dev_rcv_lists *dev_rcv_lists; struct net *net = m->private; /* RX_SFF */ seq_puts(m, "\nreceive list 'rx_sff':\n"); rcu_read_lock(); /* sff receive list for 'all' CAN devices (dev == NULL) */ dev_rcv_lists = net->can.rx_alldev_list; can_rcvlist_proc_show_array(m, NULL, dev_rcv_lists->rx_sff, ARRAY_SIZE(dev_rcv_lists->rx_sff)); /* sff receive list for registered CAN devices */ for_each_netdev_rcu(net, dev) { struct can_ml_priv *can_ml = can_get_ml_priv(dev); if (can_ml) { dev_rcv_lists = &can_ml->dev_rcv_lists; can_rcvlist_proc_show_array(m, dev, dev_rcv_lists->rx_sff, ARRAY_SIZE(dev_rcv_lists->rx_sff)); } } rcu_read_unlock(); seq_putc(m, '\n'); return 0; } static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v) { struct net_device *dev; struct can_dev_rcv_lists *dev_rcv_lists; struct net *net = m->private; /* RX_EFF */ seq_puts(m, "\nreceive list 'rx_eff':\n"); rcu_read_lock(); /* eff receive list for 'all' CAN devices (dev == NULL) */ dev_rcv_lists = net->can.rx_alldev_list; can_rcvlist_proc_show_array(m, NULL, dev_rcv_lists->rx_eff, ARRAY_SIZE(dev_rcv_lists->rx_eff)); /* eff receive list for registered CAN devices */ for_each_netdev_rcu(net, dev) { struct can_ml_priv *can_ml = can_get_ml_priv(dev); if (can_ml) { dev_rcv_lists = &can_ml->dev_rcv_lists; can_rcvlist_proc_show_array(m, dev, dev_rcv_lists->rx_eff, ARRAY_SIZE(dev_rcv_lists->rx_eff)); } } rcu_read_unlock(); seq_putc(m, '\n'); return 0; } /* * can_init_proc - create main CAN proc directory and procfs entries */ void can_init_proc(struct net *net) { /* create /proc/net/can directory */ net->can.proc_dir = proc_net_mkdir(net, "can", net->proc_net); if (!net->can.proc_dir) { printk(KERN_INFO "can: failed to create /proc/net/can . " "CONFIG_PROC_FS missing?\n"); return; } /* own procfs entries from the AF_CAN core */ net->can.pde_stats = proc_create_net_single(CAN_PROC_STATS, 0644, net->can.proc_dir, can_stats_proc_show, NULL); net->can.pde_reset_stats = proc_create_net_single(CAN_PROC_RESET_STATS, 0644, net->can.proc_dir, can_reset_stats_proc_show, NULL); net->can.pde_rcvlist_err = proc_create_net_single(CAN_PROC_RCVLIST_ERR, 0644, net->can.proc_dir, can_rcvlist_proc_show, (void *)RX_ERR); net->can.pde_rcvlist_all = proc_create_net_single(CAN_PROC_RCVLIST_ALL, 0644, net->can.proc_dir, can_rcvlist_proc_show, (void *)RX_ALL); net->can.pde_rcvlist_fil = proc_create_net_single(CAN_PROC_RCVLIST_FIL, 0644, net->can.proc_dir, can_rcvlist_proc_show, (void *)RX_FIL); net->can.pde_rcvlist_inv = proc_create_net_single(CAN_PROC_RCVLIST_INV, 0644, net->can.proc_dir, can_rcvlist_proc_show, (void *)RX_INV); net->can.pde_rcvlist_eff = proc_create_net_single(CAN_PROC_RCVLIST_EFF, 0644, net->can.proc_dir, can_rcvlist_eff_proc_show, NULL); net->can.pde_rcvlist_sff = proc_create_net_single(CAN_PROC_RCVLIST_SFF, 0644, net->can.proc_dir, can_rcvlist_sff_proc_show, NULL); } /* * can_remove_proc - remove procfs entries and main CAN proc directory */ void can_remove_proc(struct net *net) { if (!net->can.proc_dir) return; if (net->can.pde_stats) remove_proc_entry(CAN_PROC_STATS, net->can.proc_dir); if (net->can.pde_reset_stats) remove_proc_entry(CAN_PROC_RESET_STATS, net->can.proc_dir); if (net->can.pde_rcvlist_err) remove_proc_entry(CAN_PROC_RCVLIST_ERR, net->can.proc_dir); if (net->can.pde_rcvlist_all) remove_proc_entry(CAN_PROC_RCVLIST_ALL, net->can.proc_dir); if (net->can.pde_rcvlist_fil) remove_proc_entry(CAN_PROC_RCVLIST_FIL, net->can.proc_dir); if (net->can.pde_rcvlist_inv) remove_proc_entry(CAN_PROC_RCVLIST_INV, net->can.proc_dir); if (net->can.pde_rcvlist_eff) remove_proc_entry(CAN_PROC_RCVLIST_EFF, net->can.proc_dir); if (net->can.pde_rcvlist_sff) remove_proc_entry(CAN_PROC_RCVLIST_SFF, net->can.proc_dir); remove_proc_entry("can", net->proc_net); }
4 5 10 9 2 9 9 12 3 7 7 1 12 9 31 31 14 2 3 10 12 9 12 7 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 // SPDX-License-Identifier: GPL-2.0-or-later /* * linux/fs/fat/cache.c * * Written 1992,1993 by Werner Almesberger * * Mar 1999. AV. Changed cache, so that it uses the starting cluster instead * of inode number. * May 1999. AV. Fixed the bogosity with FAT32 (read "FAT28"). Fscking lusers. * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. */ #include <linux/slab.h> #include <asm/unaligned.h> #include <linux/buffer_head.h> #include "exfat_raw.h" #include "exfat_fs.h" #define EXFAT_MAX_CACHE 16 struct exfat_cache { struct list_head cache_list; unsigned int nr_contig; /* number of contiguous clusters */ unsigned int fcluster; /* cluster number in the file. */ unsigned int dcluster; /* cluster number on disk. */ }; struct exfat_cache_id { unsigned int id; unsigned int nr_contig; unsigned int fcluster; unsigned int dcluster; }; static struct kmem_cache *exfat_cachep; static void exfat_cache_init_once(void *c) { struct exfat_cache *cache = (struct exfat_cache *)c; INIT_LIST_HEAD(&cache->cache_list); } int exfat_cache_init(void) { exfat_cachep = kmem_cache_create("exfat_cache", sizeof(struct exfat_cache), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, exfat_cache_init_once); if (!exfat_cachep) return -ENOMEM; return 0; } void exfat_cache_shutdown(void) { if (!exfat_cachep) return; kmem_cache_destroy(exfat_cachep); } static inline struct exfat_cache *exfat_cache_alloc(void) { return kmem_cache_alloc(exfat_cachep, GFP_NOFS); } static inline void exfat_cache_free(struct exfat_cache *cache) { WARN_ON(!list_empty(&cache->cache_list)); kmem_cache_free(exfat_cachep, cache); } static inline void exfat_cache_update_lru(struct inode *inode, struct exfat_cache *cache) { struct exfat_inode_info *ei = EXFAT_I(inode); if (ei->cache_lru.next != &cache->cache_list) list_move(&cache->cache_list, &ei->cache_lru); } static unsigned int exfat_cache_lookup(struct inode *inode, unsigned int fclus, struct exfat_cache_id *cid, unsigned int *cached_fclus, unsigned int *cached_dclus) { struct exfat_inode_info *ei = EXFAT_I(inode); static struct exfat_cache nohit = { .fcluster = 0, }; struct exfat_cache *hit = &nohit, *p; unsigned int offset = EXFAT_EOF_CLUSTER; spin_lock(&ei->cache_lru_lock); list_for_each_entry(p, &ei->cache_lru, cache_list) { /* Find the cache of "fclus" or nearest cache. */ if (p->fcluster <= fclus && hit->fcluster < p->fcluster) { hit = p; if (hit->fcluster + hit->nr_contig < fclus) { offset = hit->nr_contig; } else { offset = fclus - hit->fcluster; break; } } } if (hit != &nohit) { exfat_cache_update_lru(inode, hit); cid->id = ei->cache_valid_id; cid->nr_contig = hit->nr_contig; cid->fcluster = hit->fcluster; cid->dcluster = hit->dcluster; *cached_fclus = cid->fcluster + offset; *cached_dclus = cid->dcluster + offset; } spin_unlock(&ei->cache_lru_lock); return offset; } static struct exfat_cache *exfat_cache_merge(struct inode *inode, struct exfat_cache_id *new) { struct exfat_inode_info *ei = EXFAT_I(inode); struct exfat_cache *p; list_for_each_entry(p, &ei->cache_lru, cache_list) { /* Find the same part as "new" in cluster-chain. */ if (p->fcluster == new->fcluster) { if (new->nr_contig > p->nr_contig) p->nr_contig = new->nr_contig; return p; } } return NULL; } static void exfat_cache_add(struct inode *inode, struct exfat_cache_id *new) { struct exfat_inode_info *ei = EXFAT_I(inode); struct exfat_cache *cache, *tmp; if (new->fcluster == EXFAT_EOF_CLUSTER) /* dummy cache */ return; spin_lock(&ei->cache_lru_lock); if (new->id != EXFAT_CACHE_VALID && new->id != ei->cache_valid_id) goto unlock; /* this cache was invalidated */ cache = exfat_cache_merge(inode, new); if (cache == NULL) { if (ei->nr_caches < EXFAT_MAX_CACHE) { ei->nr_caches++; spin_unlock(&ei->cache_lru_lock); tmp = exfat_cache_alloc(); if (!tmp) { spin_lock(&ei->cache_lru_lock); ei->nr_caches--; spin_unlock(&ei->cache_lru_lock); return; } spin_lock(&ei->cache_lru_lock); cache = exfat_cache_merge(inode, new); if (cache != NULL) { ei->nr_caches--; exfat_cache_free(tmp); goto out_update_lru; } cache = tmp; } else { struct list_head *p = ei->cache_lru.prev; cache = list_entry(p, struct exfat_cache, cache_list); } cache->fcluster = new->fcluster; cache->dcluster = new->dcluster; cache->nr_contig = new->nr_contig; } out_update_lru: exfat_cache_update_lru(inode, cache); unlock: spin_unlock(&ei->cache_lru_lock); } /* * Cache invalidation occurs rarely, thus the LRU chain is not updated. It * fixes itself after a while. */ static void __exfat_cache_inval_inode(struct inode *inode) { struct exfat_inode_info *ei = EXFAT_I(inode); struct exfat_cache *cache; while (!list_empty(&ei->cache_lru)) { cache = list_entry(ei->cache_lru.next, struct exfat_cache, cache_list); list_del_init(&cache->cache_list); ei->nr_caches--; exfat_cache_free(cache); } /* Update. The copy of caches before this id is discarded. */ ei->cache_valid_id++; if (ei->cache_valid_id == EXFAT_CACHE_VALID) ei->cache_valid_id++; } void exfat_cache_inval_inode(struct inode *inode) { struct exfat_inode_info *ei = EXFAT_I(inode); spin_lock(&ei->cache_lru_lock); __exfat_cache_inval_inode(inode); spin_unlock(&ei->cache_lru_lock); } static inline int cache_contiguous(struct exfat_cache_id *cid, unsigned int dclus) { cid->nr_contig++; return cid->dcluster + cid->nr_contig == dclus; } static inline void cache_init(struct exfat_cache_id *cid, unsigned int fclus, unsigned int dclus) { cid->id = EXFAT_CACHE_VALID; cid->fcluster = fclus; cid->dcluster = dclus; cid->nr_contig = 0; } int exfat_get_cluster(struct inode *inode, unsigned int cluster, unsigned int *fclus, unsigned int *dclus, unsigned int *last_dclus, int allow_eof) { struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); unsigned int limit = sbi->num_clusters; struct exfat_inode_info *ei = EXFAT_I(inode); struct exfat_cache_id cid; unsigned int content; if (ei->start_clu == EXFAT_FREE_CLUSTER) { exfat_fs_error(sb, "invalid access to exfat cache (entry 0x%08x)", ei->start_clu); return -EIO; } *fclus = 0; *dclus = ei->start_clu; *last_dclus = *dclus; /* * Don`t use exfat_cache if zero offset or non-cluster allocation */ if (cluster == 0 || *dclus == EXFAT_EOF_CLUSTER) return 0; cache_init(&cid, EXFAT_EOF_CLUSTER, EXFAT_EOF_CLUSTER); if (exfat_cache_lookup(inode, cluster, &cid, fclus, dclus) == EXFAT_EOF_CLUSTER) { /* * dummy, always not contiguous * This is reinitialized by cache_init(), later. */ WARN_ON(cid.id != EXFAT_CACHE_VALID || cid.fcluster != EXFAT_EOF_CLUSTER || cid.dcluster != EXFAT_EOF_CLUSTER || cid.nr_contig != 0); } if (*fclus == cluster) return 0; while (*fclus < cluster) { /* prevent the infinite loop of cluster chain */ if (*fclus > limit) { exfat_fs_error(sb, "detected the cluster chain loop (i_pos %u)", (*fclus)); return -EIO; } if (exfat_ent_get(sb, *dclus, &content)) return -EIO; *last_dclus = *dclus; *dclus = content; (*fclus)++; if (content == EXFAT_EOF_CLUSTER) { if (!allow_eof) { exfat_fs_error(sb, "invalid cluster chain (i_pos %u, last_clus 0x%08x is EOF)", *fclus, (*last_dclus)); return -EIO; } break; } if (!cache_contiguous(&cid, *dclus)) cache_init(&cid, *fclus, *dclus); } exfat_cache_add(inode, &cid); return 0; }
13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_MROUTE6_H #define __LINUX_MROUTE6_H #include <linux/pim.h> #include <linux/skbuff.h> /* for struct sk_buff_head */ #include <net/net_namespace.h> #include <uapi/linux/mroute6.h> #include <linux/mroute_base.h> #include <linux/sockptr.h> #include <net/fib_rules.h> #ifdef CONFIG_IPV6_MROUTE static inline int ip6_mroute_opt(int opt) { return (opt >= MRT6_BASE) && (opt <= MRT6_MAX); } #else static inline int ip6_mroute_opt(int opt) { return 0; } #endif struct sock; #ifdef CONFIG_IPV6_MROUTE extern int ip6_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int); extern int ip6_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t); extern int ip6_mr_input(struct sk_buff *skb); extern int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); extern int ip6_mr_init(void); extern void ip6_mr_cleanup(void); int ip6mr_ioctl(struct sock *sk, int cmd, void *arg); #else static inline int ip6_mroute_setsockopt(struct sock *sock, int optname, sockptr_t optval, unsigned int optlen) { return -ENOPROTOOPT; } static inline int ip6_mroute_getsockopt(struct sock *sock, int optname, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } static inline int ip6mr_ioctl(struct sock *sk, int cmd, void *arg) { return -ENOIOCTLCMD; } static inline int ip6_mr_init(void) { return 0; } static inline void ip6_mr_cleanup(void) { return; } #endif #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES bool ip6mr_rule_default(const struct fib_rule *rule); #else static inline bool ip6mr_rule_default(const struct fib_rule *rule) { return true; } #endif #define VIFF_STATIC 0x8000 struct mfc6_cache_cmp_arg { struct in6_addr mf6c_mcastgrp; struct in6_addr mf6c_origin; }; struct mfc6_cache { struct mr_mfc _c; union { struct { struct in6_addr mf6c_mcastgrp; struct in6_addr mf6c_origin; }; struct mfc6_cache_cmp_arg cmparg; }; }; #define MFC_ASSERT_THRESH (3*HZ) /* Maximal freq. of asserts */ struct rtmsg; extern int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, u32 portid); #ifdef CONFIG_IPV6_MROUTE bool mroute6_is_socket(struct net *net, struct sk_buff *skb); extern int ip6mr_sk_done(struct sock *sk); static inline int ip6mr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { switch (cmd) { /* These userspace buffers will be consumed by ip6mr_ioctl() */ case SIOCGETMIFCNT_IN6: { struct sioc_mif_req6 buffer; return sock_ioctl_inout(sk, cmd, arg, &buffer, sizeof(buffer)); } case SIOCGETSGCNT_IN6: { struct sioc_sg_req6 buffer; return sock_ioctl_inout(sk, cmd, arg, &buffer, sizeof(buffer)); } } return 1; } #else static inline bool mroute6_is_socket(struct net *net, struct sk_buff *skb) { return false; } static inline int ip6mr_sk_done(struct sock *sk) { return 0; } static inline int ip6mr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { return 1; } #endif #endif
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2018 Christoph Hellwig. * * DMA operations that map physical memory directly without using an IOMMU. */ #ifndef _KERNEL_DMA_DIRECT_H #define _KERNEL_DMA_DIRECT_H #include <linux/dma-direct.h> #include <linux/memremap.h> int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); bool dma_direct_can_mmap(struct device *dev); int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr); int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs); bool dma_direct_all_ram_mapped(struct device *dev); size_t dma_direct_max_mapping_size(struct device *dev); #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_SWIOTLB) void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir); #else static inline void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { } #endif #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ defined(CONFIG_SWIOTLB) void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs); void dma_direct_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir); #else static inline void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { } static inline void dma_direct_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { } #endif static inline void dma_direct_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { phys_addr_t paddr = dma_to_phys(dev, addr); if (unlikely(is_swiotlb_buffer(dev, paddr))) swiotlb_sync_single_for_device(dev, paddr, size, dir); if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_device(paddr, size, dir); } static inline void dma_direct_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { phys_addr_t paddr = dma_to_phys(dev, addr); if (!dev_is_dma_coherent(dev)) { arch_sync_dma_for_cpu(paddr, size, dir); arch_sync_dma_for_cpu_all(); } if (unlikely(is_swiotlb_buffer(dev, paddr))) swiotlb_sync_single_for_cpu(dev, paddr, size, dir); if (dir == DMA_FROM_DEVICE) arch_dma_mark_clean(paddr, size); } static inline dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) { phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dma_addr = phys_to_dma(dev, phys); if (is_swiotlb_force_bounce(dev)) { if (is_pci_p2pdma_page(page)) return DMA_MAPPING_ERROR; return swiotlb_map(dev, phys, size, dir, attrs); } if (unlikely(!dma_capable(dev, dma_addr, size, true)) || dma_kmalloc_needs_bounce(dev, size, dir)) { if (is_pci_p2pdma_page(page)) return DMA_MAPPING_ERROR; if (is_swiotlb_active(dev)) return swiotlb_map(dev, phys, size, dir, attrs); dev_WARN_ONCE(dev, 1, "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); return DMA_MAPPING_ERROR; } if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) arch_sync_dma_for_device(phys, size, dir); return dma_addr; } static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { phys_addr_t phys = dma_to_phys(dev, addr); if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) dma_direct_sync_single_for_cpu(dev, addr, size, dir); if (unlikely(is_swiotlb_buffer(dev, phys))) swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); } #endif /* _KERNEL_DMA_DIRECT_H */
22 2 22 3 3 3 3 3 3 3 3 1 3 3 1 2 3 22 22 22 22 2 16 22 21 22 2 22 22 22 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2008 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH * Copyright 2018-2020, 2022-2023 Intel Corporation */ #include <crypto/utils.h> #include <linux/if_ether.h> #include <linux/etherdevice.h> #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/export.h> #include <net/mac80211.h> #include <asm/unaligned.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "debugfs_key.h" #include "aes_ccm.h" #include "aes_cmac.h" #include "aes_gmac.h" #include "aes_gcm.h" /** * DOC: Key handling basics * * Key handling in mac80211 is done based on per-interface (sub_if_data) * keys and per-station keys. Since each station belongs to an interface, * each station key also belongs to that interface. * * Hardware acceleration is done on a best-effort basis for algorithms * that are implemented in software, for each key the hardware is asked * to enable that key for offloading but if it cannot do that the key is * simply kept for software encryption (unless it is for an algorithm * that isn't implemented in software). * There is currently no way of knowing whether a key is handled in SW * or HW except by looking into debugfs. * * All key management is internally protected by a mutex. Within all * other parts of mac80211, key references are, just as STA structure * references, protected by RCU. Note, however, that some things are * unprotected, namely the key->sta dereferences within the hardware * acceleration functions. This means that sta_info_destroy() must * remove the key which waits for an RCU grace period. */ static const u8 bcast_addr[ETH_ALEN] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; static void update_vlan_tailroom_need_count(struct ieee80211_sub_if_data *sdata, int delta) { struct ieee80211_sub_if_data *vlan; if (sdata->vif.type != NL80211_IFTYPE_AP) return; /* crypto_tx_tailroom_needed_cnt is protected by this */ lockdep_assert_wiphy(sdata->local->hw.wiphy); rcu_read_lock(); list_for_each_entry_rcu(vlan, &sdata->u.ap.vlans, u.vlan.list) vlan->crypto_tx_tailroom_needed_cnt += delta; rcu_read_unlock(); } static void increment_tailroom_need_count(struct ieee80211_sub_if_data *sdata) { /* * When this count is zero, SKB resizing for allocating tailroom * for IV or MMIC is skipped. But, this check has created two race * cases in xmit path while transiting from zero count to one: * * 1. SKB resize was skipped because no key was added but just before * the xmit key is added and SW encryption kicks off. * * 2. SKB resize was skipped because all the keys were hw planted but * just before xmit one of the key is deleted and SW encryption kicks * off. * * In both the above case SW encryption will find not enough space for * tailroom and exits with WARN_ON. (See WARN_ONs at wpa.c) * * Solution has been explained at * http://mid.gmane.org/1308590980.4322.19.camel@jlt3.sipsolutions.net */ lockdep_assert_wiphy(sdata->local->hw.wiphy); update_vlan_tailroom_need_count(sdata, 1); if (!sdata->crypto_tx_tailroom_needed_cnt++) { /* * Flush all XMIT packets currently using HW encryption or no * encryption at all if the count transition is from 0 -> 1. */ synchronize_net(); } } static void decrease_tailroom_need_count(struct ieee80211_sub_if_data *sdata, int delta) { lockdep_assert_wiphy(sdata->local->hw.wiphy); WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt < delta); update_vlan_tailroom_need_count(sdata, -delta); sdata->crypto_tx_tailroom_needed_cnt -= delta; } static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) { struct ieee80211_sub_if_data *sdata = key->sdata; struct sta_info *sta; int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(key->local->hw.wiphy); if (key->flags & KEY_FLAG_TAINTED) { /* If we get here, it's during resume and the key is * tainted so shouldn't be used/programmed any more. * However, its flags may still indicate that it was * programmed into the device (since we're in resume) * so clear that flag now to avoid trying to remove * it again later. */ if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE && !(key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | IEEE80211_KEY_FLAG_PUT_MIC_SPACE | IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) increment_tailroom_need_count(sdata); key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; return -EINVAL; } if (!key->local->ops->set_key) goto out_unsupported; sta = key->sta; /* * If this is a per-STA GTK, check if it * is supported; if not, return. */ if (sta && !(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE) && !ieee80211_hw_check(&key->local->hw, SUPPORTS_PER_STA_GTK)) goto out_unsupported; if (sta && !sta->uploaded) goto out_unsupported; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { /* * The driver doesn't know anything about VLAN interfaces. * Hence, don't send GTKs for VLAN interfaces to the driver. */ if (!(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE)) { ret = 1; goto out_unsupported; } } if (key->conf.link_id >= 0 && sdata->vif.active_links && !(sdata->vif.active_links & BIT(key->conf.link_id))) return 0; ret = drv_set_key(key->local, SET_KEY, sdata, sta ? &sta->sta : NULL, &key->conf); if (!ret) { key->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE; if (!(key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | IEEE80211_KEY_FLAG_PUT_MIC_SPACE | IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) decrease_tailroom_need_count(sdata, 1); WARN_ON((key->conf.flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) && (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)); WARN_ON((key->conf.flags & IEEE80211_KEY_FLAG_PUT_MIC_SPACE) && (key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIC)); return 0; } if (ret != -ENOSPC && ret != -EOPNOTSUPP && ret != 1) sdata_err(sdata, "failed to set key (%d, %pM) to hardware (%d)\n", key->conf.keyidx, sta ? sta->sta.addr : bcast_addr, ret); out_unsupported: switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: case WLAN_CIPHER_SUITE_TKIP: case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: /* all of these we can do in software - if driver can */ if (ret == 1) return 0; if (ieee80211_hw_check(&key->local->hw, SW_CRYPTO_CONTROL)) return -EINVAL; return 0; default: return -EINVAL; } } static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) { struct ieee80211_sub_if_data *sdata; struct sta_info *sta; int ret; might_sleep(); if (!key || !key->local->ops->set_key) return; if (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) return; sta = key->sta; sdata = key->sdata; lockdep_assert_wiphy(key->local->hw.wiphy); if (key->conf.link_id >= 0 && sdata->vif.active_links && !(sdata->vif.active_links & BIT(key->conf.link_id))) return; if (!(key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | IEEE80211_KEY_FLAG_PUT_MIC_SPACE | IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) increment_tailroom_need_count(sdata); key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; ret = drv_set_key(key->local, DISABLE_KEY, sdata, sta ? &sta->sta : NULL, &key->conf); if (ret) sdata_err(sdata, "failed to remove key (%d, %pM) from hardware (%d)\n", key->conf.keyidx, sta ? sta->sta.addr : bcast_addr, ret); } static int _ieee80211_set_tx_key(struct ieee80211_key *key, bool force) { struct sta_info *sta = key->sta; struct ieee80211_local *local = key->local; lockdep_assert_wiphy(local->hw.wiphy); set_sta_flag(sta, WLAN_STA_USES_ENCRYPTION); sta->ptk_idx = key->conf.keyidx; if (force || !ieee80211_hw_check(&local->hw, AMPDU_KEYBORDER_SUPPORT)) clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_check_fast_xmit(sta); return 0; } int ieee80211_set_tx_key(struct ieee80211_key *key) { return _ieee80211_set_tx_key(key, false); } static void ieee80211_pairwise_rekey(struct ieee80211_key *old, struct ieee80211_key *new) { struct ieee80211_local *local = new->local; struct sta_info *sta = new->sta; int i; lockdep_assert_wiphy(local->hw.wiphy); if (new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX) { /* Extended Key ID key install, initial one or rekey */ if (sta->ptk_idx != INVALID_PTK_KEYIDX && !ieee80211_hw_check(&local->hw, AMPDU_KEYBORDER_SUPPORT)) { /* Aggregation Sessions with Extended Key ID must not * mix MPDUs with different keyIDs within one A-MPDU. * Tear down running Tx aggregation sessions and block * new Rx/Tx aggregation requests during rekey to * ensure there are no A-MPDUs when the driver is not * supporting A-MPDU key borders. (Blocking Tx only * would be sufficient but WLAN_STA_BLOCK_BA gets the * job done for the few ms we need it.) */ set_sta_flag(sta, WLAN_STA_BLOCK_BA); for (i = 0; i < IEEE80211_NUM_TIDS; i++) __ieee80211_stop_tx_ba_session(sta, i, AGG_STOP_LOCAL_REQUEST); } } else if (old) { /* Rekey without Extended Key ID. * Aggregation sessions are OK when running on SW crypto. * A broken remote STA may cause issues not observed with HW * crypto, though. */ if (!(old->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) return; /* Stop Tx till we are on the new key */ old->flags |= KEY_FLAG_TAINTED; ieee80211_clear_fast_xmit(sta); if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION)) { set_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_LOCAL_REQUEST); } if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_CAN_REPLACE_PTK0)) { pr_warn_ratelimited("Rekeying PTK for STA %pM but driver can't safely do that.", sta->sta.addr); /* Flushing the driver queues *may* help prevent * the clear text leaks and freezes. */ ieee80211_flush_queues(local, old->sdata, false); } } } static void __ieee80211_set_default_key(struct ieee80211_link_data *link, int idx, bool uni, bool multi) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_key *key = NULL; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (idx >= 0 && idx < NUM_DEFAULT_KEYS) { key = wiphy_dereference(sdata->local->hw.wiphy, sdata->keys[idx]); if (!key) key = wiphy_dereference(sdata->local->hw.wiphy, link->gtk[idx]); } if (uni) { rcu_assign_pointer(sdata->default_unicast_key, key); ieee80211_check_fast_xmit_iface(sdata); if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN) drv_set_default_unicast_key(sdata->local, sdata, idx); } if (multi) rcu_assign_pointer(link->default_multicast_key, key); ieee80211_debugfs_key_update_default(sdata); } void ieee80211_set_default_key(struct ieee80211_link_data *link, int idx, bool uni, bool multi) { lockdep_assert_wiphy(link->sdata->local->hw.wiphy); __ieee80211_set_default_key(link, idx, uni, multi); } static void __ieee80211_set_default_mgmt_key(struct ieee80211_link_data *link, int idx) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_key *key = NULL; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (idx >= NUM_DEFAULT_KEYS && idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS) key = wiphy_dereference(sdata->local->hw.wiphy, link->gtk[idx]); rcu_assign_pointer(link->default_mgmt_key, key); ieee80211_debugfs_key_update_default(sdata); } void ieee80211_set_default_mgmt_key(struct ieee80211_link_data *link, int idx) { lockdep_assert_wiphy(link->sdata->local->hw.wiphy); __ieee80211_set_default_mgmt_key(link, idx); } static void __ieee80211_set_default_beacon_key(struct ieee80211_link_data *link, int idx) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_key *key = NULL; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS && idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + NUM_DEFAULT_BEACON_KEYS) key = wiphy_dereference(sdata->local->hw.wiphy, link->gtk[idx]); rcu_assign_pointer(link->default_beacon_key, key); ieee80211_debugfs_key_update_default(sdata); } void ieee80211_set_default_beacon_key(struct ieee80211_link_data *link, int idx) { lockdep_assert_wiphy(link->sdata->local->hw.wiphy); __ieee80211_set_default_beacon_key(link, idx); } static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, struct sta_info *sta, bool pairwise, struct ieee80211_key *old, struct ieee80211_key *new) { struct link_sta_info *link_sta = sta ? &sta->deflink : NULL; int link_id; int idx; int ret = 0; bool defunikey, defmultikey, defmgmtkey, defbeaconkey; bool is_wep; lockdep_assert_wiphy(sdata->local->hw.wiphy); /* caller must provide at least one old/new */ if (WARN_ON(!new && !old)) return 0; if (new) { idx = new->conf.keyidx; is_wep = new->conf.cipher == WLAN_CIPHER_SUITE_WEP40 || new->conf.cipher == WLAN_CIPHER_SUITE_WEP104; link_id = new->conf.link_id; } else { idx = old->conf.keyidx; is_wep = old->conf.cipher == WLAN_CIPHER_SUITE_WEP40 || old->conf.cipher == WLAN_CIPHER_SUITE_WEP104; link_id = old->conf.link_id; } if (WARN(old && old->conf.link_id != link_id, "old link ID %d doesn't match new link ID %d\n", old->conf.link_id, link_id)) return -EINVAL; if (link_id >= 0) { if (!link) { link = sdata_dereference(sdata->link[link_id], sdata); if (!link) return -ENOLINK; } if (sta) { link_sta = rcu_dereference_protected(sta->link[link_id], lockdep_is_held(&sta->local->hw.wiphy->mtx)); if (!link_sta) return -ENOLINK; } } else { link = &sdata->deflink; } if ((is_wep || pairwise) && idx >= NUM_DEFAULT_KEYS) return -EINVAL; WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx); if (new && sta && pairwise) { /* Unicast rekey needs special handling. With Extended Key ID * old is still NULL for the first rekey. */ ieee80211_pairwise_rekey(old, new); } if (old) { if (old->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) { ieee80211_key_disable_hw_accel(old); if (new) ret = ieee80211_key_enable_hw_accel(new); } } else { if (!new->local->wowlan) ret = ieee80211_key_enable_hw_accel(new); else new->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE; } if (ret) return ret; if (new) list_add_tail_rcu(&new->list, &sdata->key_list); if (sta) { if (pairwise) { rcu_assign_pointer(sta->ptk[idx], new); if (new && !(new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX)) _ieee80211_set_tx_key(new, true); } else { rcu_assign_pointer(link_sta->gtk[idx], new); } /* Only needed for transition from no key -> key. * Still triggers unnecessary when using Extended Key ID * and installing the second key ID the first time. */ if (new && !old) ieee80211_check_fast_rx(sta); } else { defunikey = old && old == wiphy_dereference(sdata->local->hw.wiphy, sdata->default_unicast_key); defmultikey = old && old == wiphy_dereference(sdata->local->hw.wiphy, link->default_multicast_key); defmgmtkey = old && old == wiphy_dereference(sdata->local->hw.wiphy, link->default_mgmt_key); defbeaconkey = old && old == wiphy_dereference(sdata->local->hw.wiphy, link->default_beacon_key); if (defunikey && !new) __ieee80211_set_default_key(link, -1, true, false); if (defmultikey && !new) __ieee80211_set_default_key(link, -1, false, true); if (defmgmtkey && !new) __ieee80211_set_default_mgmt_key(link, -1); if (defbeaconkey && !new) __ieee80211_set_default_beacon_key(link, -1); if (is_wep || pairwise) rcu_assign_pointer(sdata->keys[idx], new); else rcu_assign_pointer(link->gtk[idx], new); if (defunikey && new) __ieee80211_set_default_key(link, new->conf.keyidx, true, false); if (defmultikey && new) __ieee80211_set_default_key(link, new->conf.keyidx, false, true); if (defmgmtkey && new) __ieee80211_set_default_mgmt_key(link, new->conf.keyidx); if (defbeaconkey && new) __ieee80211_set_default_beacon_key(link, new->conf.keyidx); } if (old) list_del_rcu(&old->list); return 0; } struct ieee80211_key * ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, const u8 *key_data, size_t seq_len, const u8 *seq) { struct ieee80211_key *key; int i, j, err; if (WARN_ON(idx < 0 || idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + NUM_DEFAULT_BEACON_KEYS)) return ERR_PTR(-EINVAL); key = kzalloc(sizeof(struct ieee80211_key) + key_len, GFP_KERNEL); if (!key) return ERR_PTR(-ENOMEM); /* * Default to software encryption; we'll later upload the * key to the hardware if possible. */ key->conf.flags = 0; key->flags = 0; key->conf.link_id = -1; key->conf.cipher = cipher; key->conf.keyidx = idx; key->conf.keylen = key_len; switch (cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: key->conf.iv_len = IEEE80211_WEP_IV_LEN; key->conf.icv_len = IEEE80211_WEP_ICV_LEN; break; case WLAN_CIPHER_SUITE_TKIP: key->conf.iv_len = IEEE80211_TKIP_IV_LEN; key->conf.icv_len = IEEE80211_TKIP_ICV_LEN; if (seq) { for (i = 0; i < IEEE80211_NUM_TIDS; i++) { key->u.tkip.rx[i].iv32 = get_unaligned_le32(&seq[2]); key->u.tkip.rx[i].iv16 = get_unaligned_le16(seq); } } spin_lock_init(&key->u.tkip.txlock); break; case WLAN_CIPHER_SUITE_CCMP: key->conf.iv_len = IEEE80211_CCMP_HDR_LEN; key->conf.icv_len = IEEE80211_CCMP_MIC_LEN; if (seq) { for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) for (j = 0; j < IEEE80211_CCMP_PN_LEN; j++) key->u.ccmp.rx_pn[i][j] = seq[IEEE80211_CCMP_PN_LEN - j - 1]; } /* * Initialize AES key state here as an optimization so that * it does not need to be initialized for every packet. */ key->u.ccmp.tfm = ieee80211_aes_key_setup_encrypt( key_data, key_len, IEEE80211_CCMP_MIC_LEN); if (IS_ERR(key->u.ccmp.tfm)) { err = PTR_ERR(key->u.ccmp.tfm); kfree(key); return ERR_PTR(err); } break; case WLAN_CIPHER_SUITE_CCMP_256: key->conf.iv_len = IEEE80211_CCMP_256_HDR_LEN; key->conf.icv_len = IEEE80211_CCMP_256_MIC_LEN; for (i = 0; seq && i < IEEE80211_NUM_TIDS + 1; i++) for (j = 0; j < IEEE80211_CCMP_256_PN_LEN; j++) key->u.ccmp.rx_pn[i][j] = seq[IEEE80211_CCMP_256_PN_LEN - j - 1]; /* Initialize AES key state here as an optimization so that * it does not need to be initialized for every packet. */ key->u.ccmp.tfm = ieee80211_aes_key_setup_encrypt( key_data, key_len, IEEE80211_CCMP_256_MIC_LEN); if (IS_ERR(key->u.ccmp.tfm)) { err = PTR_ERR(key->u.ccmp.tfm); kfree(key); return ERR_PTR(err); } break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: key->conf.iv_len = 0; if (cipher == WLAN_CIPHER_SUITE_AES_CMAC) key->conf.icv_len = sizeof(struct ieee80211_mmie); else key->conf.icv_len = sizeof(struct ieee80211_mmie_16); if (seq) for (j = 0; j < IEEE80211_CMAC_PN_LEN; j++) key->u.aes_cmac.rx_pn[j] = seq[IEEE80211_CMAC_PN_LEN - j - 1]; /* * Initialize AES key state here as an optimization so that * it does not need to be initialized for every packet. */ key->u.aes_cmac.tfm = ieee80211_aes_cmac_key_setup(key_data, key_len); if (IS_ERR(key->u.aes_cmac.tfm)) { err = PTR_ERR(key->u.aes_cmac.tfm); kfree(key); return ERR_PTR(err); } break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: key->conf.iv_len = 0; key->conf.icv_len = sizeof(struct ieee80211_mmie_16); if (seq) for (j = 0; j < IEEE80211_GMAC_PN_LEN; j++) key->u.aes_gmac.rx_pn[j] = seq[IEEE80211_GMAC_PN_LEN - j - 1]; /* Initialize AES key state here as an optimization so that * it does not need to be initialized for every packet. */ key->u.aes_gmac.tfm = ieee80211_aes_gmac_key_setup(key_data, key_len); if (IS_ERR(key->u.aes_gmac.tfm)) { err = PTR_ERR(key->u.aes_gmac.tfm); kfree(key); return ERR_PTR(err); } break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: key->conf.iv_len = IEEE80211_GCMP_HDR_LEN; key->conf.icv_len = IEEE80211_GCMP_MIC_LEN; for (i = 0; seq && i < IEEE80211_NUM_TIDS + 1; i++) for (j = 0; j < IEEE80211_GCMP_PN_LEN; j++) key->u.gcmp.rx_pn[i][j] = seq[IEEE80211_GCMP_PN_LEN - j - 1]; /* Initialize AES key state here as an optimization so that * it does not need to be initialized for every packet. */ key->u.gcmp.tfm = ieee80211_aes_gcm_key_setup_encrypt(key_data, key_len); if (IS_ERR(key->u.gcmp.tfm)) { err = PTR_ERR(key->u.gcmp.tfm); kfree(key); return ERR_PTR(err); } break; } memcpy(key->conf.key, key_data, key_len); INIT_LIST_HEAD(&key->list); return key; } static void ieee80211_key_free_common(struct ieee80211_key *key) { switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: ieee80211_aes_key_free(key->u.ccmp.tfm); break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: ieee80211_aes_cmac_key_free(key->u.aes_cmac.tfm); break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: ieee80211_aes_gmac_key_free(key->u.aes_gmac.tfm); break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: ieee80211_aes_gcm_key_free(key->u.gcmp.tfm); break; } kfree_sensitive(key); } static void __ieee80211_key_destroy(struct ieee80211_key *key, bool delay_tailroom) { if (key->local) { struct ieee80211_sub_if_data *sdata = key->sdata; ieee80211_debugfs_key_remove(key); if (delay_tailroom) { /* see ieee80211_delayed_tailroom_dec */ sdata->crypto_tx_tailroom_pending_dec++; wiphy_delayed_work_queue(sdata->local->hw.wiphy, &sdata->dec_tailroom_needed_wk, HZ / 2); } else { decrease_tailroom_need_count(sdata, 1); } } ieee80211_key_free_common(key); } static void ieee80211_key_destroy(struct ieee80211_key *key, bool delay_tailroom) { if (!key) return; /* * Synchronize so the TX path and rcu key iterators * can no longer be using this key before we free/remove it. */ synchronize_net(); __ieee80211_key_destroy(key, delay_tailroom); } void ieee80211_key_free_unused(struct ieee80211_key *key) { if (!key) return; WARN_ON(key->sdata || key->local); ieee80211_key_free_common(key); } static bool ieee80211_key_identical(struct ieee80211_sub_if_data *sdata, struct ieee80211_key *old, struct ieee80211_key *new) { u8 tkip_old[WLAN_KEY_LEN_TKIP], tkip_new[WLAN_KEY_LEN_TKIP]; u8 *tk_old, *tk_new; if (!old || new->conf.keylen != old->conf.keylen) return false; tk_old = old->conf.key; tk_new = new->conf.key; /* * In station mode, don't compare the TX MIC key, as it's never used * and offloaded rekeying may not care to send it to the host. This * is the case in iwlwifi, for example. */ if (sdata->vif.type == NL80211_IFTYPE_STATION && new->conf.cipher == WLAN_CIPHER_SUITE_TKIP && new->conf.keylen == WLAN_KEY_LEN_TKIP && !(new->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE)) { memcpy(tkip_old, tk_old, WLAN_KEY_LEN_TKIP); memcpy(tkip_new, tk_new, WLAN_KEY_LEN_TKIP); memset(tkip_old + NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY, 0, 8); memset(tkip_new + NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY, 0, 8); tk_old = tkip_old; tk_new = tkip_new; } return !crypto_memneq(tk_old, tk_new, new->conf.keylen); } int ieee80211_key_link(struct ieee80211_key *key, struct ieee80211_link_data *link, struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = link->sdata; static atomic_t key_color = ATOMIC_INIT(0); struct ieee80211_key *old_key = NULL; int idx = key->conf.keyidx; bool pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE; /* * We want to delay tailroom updates only for station - in that * case it helps roaming speed, but in other cases it hurts and * can cause warnings to appear. */ bool delay_tailroom = sdata->vif.type == NL80211_IFTYPE_STATION; int ret; lockdep_assert_wiphy(sdata->local->hw.wiphy); if (sta && pairwise) { struct ieee80211_key *alt_key; old_key = wiphy_dereference(sdata->local->hw.wiphy, sta->ptk[idx]); alt_key = wiphy_dereference(sdata->local->hw.wiphy, sta->ptk[idx ^ 1]); /* The rekey code assumes that the old and new key are using * the same cipher. Enforce the assumption for pairwise keys. */ if ((alt_key && alt_key->conf.cipher != key->conf.cipher) || (old_key && old_key->conf.cipher != key->conf.cipher)) { ret = -EOPNOTSUPP; goto out; } } else if (sta) { struct link_sta_info *link_sta = &sta->deflink; int link_id = key->conf.link_id; if (link_id >= 0) { link_sta = rcu_dereference_protected(sta->link[link_id], lockdep_is_held(&sta->local->hw.wiphy->mtx)); if (!link_sta) { ret = -ENOLINK; goto out; } } old_key = wiphy_dereference(sdata->local->hw.wiphy, link_sta->gtk[idx]); } else { if (idx < NUM_DEFAULT_KEYS) old_key = wiphy_dereference(sdata->local->hw.wiphy, sdata->keys[idx]); if (!old_key) old_key = wiphy_dereference(sdata->local->hw.wiphy, link->gtk[idx]); } /* Non-pairwise keys must also not switch the cipher on rekey */ if (!pairwise) { if (old_key && old_key->conf.cipher != key->conf.cipher) { ret = -EOPNOTSUPP; goto out; } } /* * Silently accept key re-installation without really installing the * new version of the key to avoid nonce reuse or replay issues. */ if (ieee80211_key_identical(sdata, old_key, key)) { ret = -EALREADY; goto out; } key->local = sdata->local; key->sdata = sdata; key->sta = sta; /* * Assign a unique ID to every key so we can easily prevent mixed * key and fragment cache attacks. */ key->color = atomic_inc_return(&key_color); increment_tailroom_need_count(sdata); ret = ieee80211_key_replace(sdata, link, sta, pairwise, old_key, key); if (!ret) { ieee80211_debugfs_key_add(key); ieee80211_key_destroy(old_key, delay_tailroom); } else { ieee80211_key_free(key, delay_tailroom); } key = NULL; out: ieee80211_key_free_unused(key); return ret; } void ieee80211_key_free(struct ieee80211_key *key, bool delay_tailroom) { if (!key) return; /* * Replace key with nothingness if it was ever used. */ if (key->sdata) ieee80211_key_replace(key->sdata, NULL, key->sta, key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, key, NULL); ieee80211_key_destroy(key, delay_tailroom); } void ieee80211_reenable_keys(struct ieee80211_sub_if_data *sdata) { struct ieee80211_key *key; struct ieee80211_sub_if_data *vlan; lockdep_assert_wiphy(sdata->local->hw.wiphy); sdata->crypto_tx_tailroom_needed_cnt = 0; sdata->crypto_tx_tailroom_pending_dec = 0; if (sdata->vif.type == NL80211_IFTYPE_AP) { list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) { vlan->crypto_tx_tailroom_needed_cnt = 0; vlan->crypto_tx_tailroom_pending_dec = 0; } } if (ieee80211_sdata_running(sdata)) { list_for_each_entry(key, &sdata->key_list, list) { increment_tailroom_need_count(sdata); ieee80211_key_enable_hw_accel(key); } } } void ieee80211_iter_keys(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void (*iter)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct ieee80211_key_conf *key, void *data), void *iter_data) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_key *key, *tmp; struct ieee80211_sub_if_data *sdata; lockdep_assert_wiphy(hw->wiphy); if (vif) { sdata = vif_to_sdata(vif); list_for_each_entry_safe(key, tmp, &sdata->key_list, list) iter(hw, &sdata->vif, key->sta ? &key->sta->sta : NULL, &key->conf, iter_data); } else { list_for_each_entry(sdata, &local->interfaces, list) list_for_each_entry_safe(key, tmp, &sdata->key_list, list) iter(hw, &sdata->vif, key->sta ? &key->sta->sta : NULL, &key->conf, iter_data); } } EXPORT_SYMBOL(ieee80211_iter_keys); static void _ieee80211_iter_keys_rcu(struct ieee80211_hw *hw, struct ieee80211_sub_if_data *sdata, void (*iter)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct ieee80211_key_conf *key, void *data), void *iter_data) { struct ieee80211_key *key; list_for_each_entry_rcu(key, &sdata->key_list, list) { /* skip keys of station in removal process */ if (key->sta && key->sta->removed) continue; if (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)) continue; iter(hw, &sdata->vif, key->sta ? &key->sta->sta : NULL, &key->conf, iter_data); } } void ieee80211_iter_keys_rcu(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void (*iter)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct ieee80211_key_conf *key, void *data), void *iter_data) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata; if (vif) { sdata = vif_to_sdata(vif); _ieee80211_iter_keys_rcu(hw, sdata, iter, iter_data); } else { list_for_each_entry_rcu(sdata, &local->interfaces, list) _ieee80211_iter_keys_rcu(hw, sdata, iter, iter_data); } } EXPORT_SYMBOL(ieee80211_iter_keys_rcu); static void ieee80211_free_keys_iface(struct ieee80211_sub_if_data *sdata, struct list_head *keys) { struct ieee80211_key *key, *tmp; decrease_tailroom_need_count(sdata, sdata->crypto_tx_tailroom_pending_dec); sdata->crypto_tx_tailroom_pending_dec = 0; ieee80211_debugfs_key_remove_mgmt_default(sdata); ieee80211_debugfs_key_remove_beacon_default(sdata); list_for_each_entry_safe(key, tmp, &sdata->key_list, list) { ieee80211_key_replace(key->sdata, NULL, key->sta, key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, key, NULL); list_add_tail(&key->list, keys); } ieee80211_debugfs_key_update_default(sdata); } void ieee80211_remove_link_keys(struct ieee80211_link_data *link, struct list_head *keys) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_key *key, *tmp; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry_safe(key, tmp, &sdata->key_list, list) { if (key->conf.link_id != link->link_id) continue; ieee80211_key_replace(key->sdata, link, key->sta, key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, key, NULL); list_add_tail(&key->list, keys); } } void ieee80211_free_key_list(struct ieee80211_local *local, struct list_head *keys) { struct ieee80211_key *key, *tmp; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry_safe(key, tmp, keys, list) __ieee80211_key_destroy(key, false); } void ieee80211_free_keys(struct ieee80211_sub_if_data *sdata, bool force_synchronize) { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *vlan; struct ieee80211_sub_if_data *master; struct ieee80211_key *key, *tmp; LIST_HEAD(keys); wiphy_delayed_work_cancel(local->hw.wiphy, &sdata->dec_tailroom_needed_wk); lockdep_assert_wiphy(local->hw.wiphy); ieee80211_free_keys_iface(sdata, &keys); if (sdata->vif.type == NL80211_IFTYPE_AP) { list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) ieee80211_free_keys_iface(vlan, &keys); } if (!list_empty(&keys) || force_synchronize) synchronize_net(); list_for_each_entry_safe(key, tmp, &keys, list) __ieee80211_key_destroy(key, false); if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { if (sdata->bss) { master = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt != master->crypto_tx_tailroom_needed_cnt); } } else { WARN_ON_ONCE(sdata->crypto_tx_tailroom_needed_cnt || sdata->crypto_tx_tailroom_pending_dec); } if (sdata->vif.type == NL80211_IFTYPE_AP) { list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) WARN_ON_ONCE(vlan->crypto_tx_tailroom_needed_cnt || vlan->crypto_tx_tailroom_pending_dec); } } void ieee80211_free_sta_keys(struct ieee80211_local *local, struct sta_info *sta) { struct ieee80211_key *key; int i; lockdep_assert_wiphy(local->hw.wiphy); for (i = 0; i < ARRAY_SIZE(sta->deflink.gtk); i++) { key = wiphy_dereference(local->hw.wiphy, sta->deflink.gtk[i]); if (!key) continue; ieee80211_key_replace(key->sdata, NULL, key->sta, key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, key, NULL); __ieee80211_key_destroy(key, key->sdata->vif.type == NL80211_IFTYPE_STATION); } for (i = 0; i < NUM_DEFAULT_KEYS; i++) { key = wiphy_dereference(local->hw.wiphy, sta->ptk[i]); if (!key) continue; ieee80211_key_replace(key->sdata, NULL, key->sta, key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, key, NULL); __ieee80211_key_destroy(key, key->sdata->vif.type == NL80211_IFTYPE_STATION); } } void ieee80211_delayed_tailroom_dec(struct wiphy *wiphy, struct wiphy_work *wk) { struct ieee80211_sub_if_data *sdata; sdata = container_of(wk, struct ieee80211_sub_if_data, dec_tailroom_needed_wk.work); /* * The reason for the delayed tailroom needed decrementing is to * make roaming faster: during roaming, all keys are first deleted * and then new keys are installed. The first new key causes the * crypto_tx_tailroom_needed_cnt to go from 0 to 1, which invokes * the cost of synchronize_net() (which can be slow). Avoid this * by deferring the crypto_tx_tailroom_needed_cnt decrementing on * key removal for a while, so if we roam the value is larger than * zero and no 0->1 transition happens. * * The cost is that if the AP switching was from an AP with keys * to one without, we still allocate tailroom while it would no * longer be needed. However, in the typical (fast) roaming case * within an ESS this usually won't happen. */ decrease_tailroom_need_count(sdata, sdata->crypto_tx_tailroom_pending_dec); sdata->crypto_tx_tailroom_pending_dec = 0; } void ieee80211_gtk_rekey_notify(struct ieee80211_vif *vif, const u8 *bssid, const u8 *replay_ctr, gfp_t gfp) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); trace_api_gtk_rekey_notify(sdata, bssid, replay_ctr); cfg80211_gtk_rekey_notify(sdata->dev, bssid, replay_ctr, gfp); } EXPORT_SYMBOL_GPL(ieee80211_gtk_rekey_notify); void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf, int tid, struct ieee80211_key_seq *seq) { struct ieee80211_key *key; const u8 *pn; key = container_of(keyconf, struct ieee80211_key, conf); switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_TKIP: if (WARN_ON(tid < 0 || tid >= IEEE80211_NUM_TIDS)) return; seq->tkip.iv32 = key->u.tkip.rx[tid].iv32; seq->tkip.iv16 = key->u.tkip.rx[tid].iv16; break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: if (WARN_ON(tid < -1 || tid >= IEEE80211_NUM_TIDS)) return; if (tid < 0) pn = key->u.ccmp.rx_pn[IEEE80211_NUM_TIDS]; else pn = key->u.ccmp.rx_pn[tid]; memcpy(seq->ccmp.pn, pn, IEEE80211_CCMP_PN_LEN); break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: if (WARN_ON(tid != 0)) return; pn = key->u.aes_cmac.rx_pn; memcpy(seq->aes_cmac.pn, pn, IEEE80211_CMAC_PN_LEN); break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: if (WARN_ON(tid != 0)) return; pn = key->u.aes_gmac.rx_pn; memcpy(seq->aes_gmac.pn, pn, IEEE80211_GMAC_PN_LEN); break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: if (WARN_ON(tid < -1 || tid >= IEEE80211_NUM_TIDS)) return; if (tid < 0) pn = key->u.gcmp.rx_pn[IEEE80211_NUM_TIDS]; else pn = key->u.gcmp.rx_pn[tid]; memcpy(seq->gcmp.pn, pn, IEEE80211_GCMP_PN_LEN); break; } } EXPORT_SYMBOL(ieee80211_get_key_rx_seq); void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, int tid, struct ieee80211_key_seq *seq) { struct ieee80211_key *key; u8 *pn; key = container_of(keyconf, struct ieee80211_key, conf); switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_TKIP: if (WARN_ON(tid < 0 || tid >= IEEE80211_NUM_TIDS)) return; key->u.tkip.rx[tid].iv32 = seq->tkip.iv32; key->u.tkip.rx[tid].iv16 = seq->tkip.iv16; break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: if (WARN_ON(tid < -1 || tid >= IEEE80211_NUM_TIDS)) return; if (tid < 0) pn = key->u.ccmp.rx_pn[IEEE80211_NUM_TIDS]; else pn = key->u.ccmp.rx_pn[tid]; memcpy(pn, seq->ccmp.pn, IEEE80211_CCMP_PN_LEN); break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: if (WARN_ON(tid != 0)) return; pn = key->u.aes_cmac.rx_pn; memcpy(pn, seq->aes_cmac.pn, IEEE80211_CMAC_PN_LEN); break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: if (WARN_ON(tid != 0)) return; pn = key->u.aes_gmac.rx_pn; memcpy(pn, seq->aes_gmac.pn, IEEE80211_GMAC_PN_LEN); break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: if (WARN_ON(tid < -1 || tid >= IEEE80211_NUM_TIDS)) return; if (tid < 0) pn = key->u.gcmp.rx_pn[IEEE80211_NUM_TIDS]; else pn = key->u.gcmp.rx_pn[tid]; memcpy(pn, seq->gcmp.pn, IEEE80211_GCMP_PN_LEN); break; default: WARN_ON(1); break; } } EXPORT_SYMBOL_GPL(ieee80211_set_key_rx_seq); void ieee80211_remove_key(struct ieee80211_key_conf *keyconf) { struct ieee80211_key *key; key = container_of(keyconf, struct ieee80211_key, conf); lockdep_assert_wiphy(key->local->hw.wiphy); /* * if key was uploaded, we assume the driver will/has remove(d) * it, so adjust bookkeeping accordingly */ if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) { key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; if (!(key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | IEEE80211_KEY_FLAG_PUT_MIC_SPACE | IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) increment_tailroom_need_count(key->sdata); } ieee80211_key_free(key, false); } EXPORT_SYMBOL_GPL(ieee80211_remove_key); struct ieee80211_key_conf * ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, struct ieee80211_key_conf *keyconf) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; struct ieee80211_key *key; int err; if (WARN_ON(!local->wowlan)) return ERR_PTR(-EINVAL); if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) return ERR_PTR(-EINVAL); key = ieee80211_key_alloc(keyconf->cipher, keyconf->keyidx, keyconf->keylen, keyconf->key, 0, NULL); if (IS_ERR(key)) return ERR_CAST(key); if (sdata->u.mgd.mfp != IEEE80211_MFP_DISABLED) key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT; /* FIXME: this function needs to get a link ID */ err = ieee80211_key_link(key, &sdata->deflink, NULL); if (err) return ERR_PTR(err); return &key->conf; } EXPORT_SYMBOL_GPL(ieee80211_gtk_rekey_add); void ieee80211_key_mic_failure(struct ieee80211_key_conf *keyconf) { struct ieee80211_key *key; key = container_of(keyconf, struct ieee80211_key, conf); switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: key->u.aes_cmac.icverrors++; break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: key->u.aes_gmac.icverrors++; break; default: /* ignore the others for now, we don't keep counters now */ break; } } EXPORT_SYMBOL_GPL(ieee80211_key_mic_failure); void ieee80211_key_replay(struct ieee80211_key_conf *keyconf) { struct ieee80211_key *key; key = container_of(keyconf, struct ieee80211_key, conf); switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: key->u.ccmp.replays++; break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: key->u.aes_cmac.replays++; break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: key->u.aes_gmac.replays++; break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: key->u.gcmp.replays++; break; } } EXPORT_SYMBOL_GPL(ieee80211_key_replay); int ieee80211_key_switch_links(struct ieee80211_sub_if_data *sdata, unsigned long del_links_mask, unsigned long add_links_mask) { struct ieee80211_key *key; int ret; list_for_each_entry(key, &sdata->key_list, list) { if (key->conf.link_id < 0 || !(del_links_mask & BIT(key->conf.link_id))) continue; /* shouldn't happen for per-link keys */ WARN_ON(key->sta); ieee80211_key_disable_hw_accel(key); } list_for_each_entry(key, &sdata->key_list, list) { if (key->conf.link_id < 0 || !(add_links_mask & BIT(key->conf.link_id))) continue; /* shouldn't happen for per-link keys */ WARN_ON(key->sta); ret = ieee80211_key_enable_hw_accel(key); if (ret) return ret; } return 0; }
32 32 33 33 33 69 68 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Authors: Sjur Brendeland * Daniel Martensson */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/fs.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/if_ether.h> #include <linux/ip.h> #include <linux/sched.h> #include <linux/sockios.h> #include <linux/caif/if_caif.h> #include <net/rtnetlink.h> #include <net/caif/caif_layer.h> #include <net/caif/cfpkt.h> #include <net/caif/caif_dev.h> /* GPRS PDP connection has MTU to 1500 */ #define GPRS_PDP_MTU 1500 /* 5 sec. connect timeout */ #define CONNECT_TIMEOUT (5 * HZ) #define CAIF_NET_DEFAULT_QUEUE_LEN 500 #define UNDEF_CONNID 0xffffffff /*This list is protected by the rtnl lock. */ static LIST_HEAD(chnl_net_list); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("caif"); enum caif_states { CAIF_CONNECTED = 1, CAIF_CONNECTING, CAIF_DISCONNECTED, CAIF_SHUTDOWN }; struct chnl_net { struct cflayer chnl; struct caif_connect_request conn_req; struct list_head list_field; struct net_device *netdev; char name[256]; wait_queue_head_t netmgmt_wq; /* Flow status to remember and control the transmission. */ bool flowenabled; enum caif_states state; }; static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt) { struct sk_buff *skb; struct chnl_net *priv; int pktlen; const u8 *ip_version; u8 buf; priv = container_of(layr, struct chnl_net, chnl); skb = (struct sk_buff *) cfpkt_tonative(pkt); /* Get length of CAIF packet. */ pktlen = skb->len; /* Pass some minimum information and * send the packet to the net stack. */ skb->dev = priv->netdev; /* check the version of IP */ ip_version = skb_header_pointer(skb, 0, 1, &buf); if (!ip_version) { kfree_skb(skb); return -EINVAL; } switch (*ip_version >> 4) { case 4: skb->protocol = htons(ETH_P_IP); break; case 6: skb->protocol = htons(ETH_P_IPV6); break; default: kfree_skb(skb); priv->netdev->stats.rx_errors++; return -EINVAL; } /* If we change the header in loop mode, the checksum is corrupted. */ if (priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP) skb->ip_summed = CHECKSUM_UNNECESSARY; else skb->ip_summed = CHECKSUM_NONE; netif_rx(skb); /* Update statistics. */ priv->netdev->stats.rx_packets++; priv->netdev->stats.rx_bytes += pktlen; return 0; } static int delete_device(struct chnl_net *dev) { ASSERT_RTNL(); if (dev->netdev) unregister_netdevice(dev->netdev); return 0; } static void close_work(struct work_struct *work) { struct chnl_net *dev = NULL; struct list_head *list_node; struct list_head *_tmp; rtnl_lock(); list_for_each_safe(list_node, _tmp, &chnl_net_list) { dev = list_entry(list_node, struct chnl_net, list_field); if (dev->state == CAIF_SHUTDOWN) dev_close(dev->netdev); } rtnl_unlock(); } static DECLARE_WORK(close_worker, close_work); static void chnl_hold(struct cflayer *lyr) { struct chnl_net *priv = container_of(lyr, struct chnl_net, chnl); dev_hold(priv->netdev); } static void chnl_put(struct cflayer *lyr) { struct chnl_net *priv = container_of(lyr, struct chnl_net, chnl); dev_put(priv->netdev); } static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow, int phyid) { struct chnl_net *priv = container_of(layr, struct chnl_net, chnl); pr_debug("NET flowctrl func called flow: %s\n", flow == CAIF_CTRLCMD_FLOW_ON_IND ? "ON" : flow == CAIF_CTRLCMD_INIT_RSP ? "INIT" : flow == CAIF_CTRLCMD_FLOW_OFF_IND ? "OFF" : flow == CAIF_CTRLCMD_DEINIT_RSP ? "CLOSE/DEINIT" : flow == CAIF_CTRLCMD_INIT_FAIL_RSP ? "OPEN_FAIL" : flow == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND ? "REMOTE_SHUTDOWN" : "UNKNOWN CTRL COMMAND"); switch (flow) { case CAIF_CTRLCMD_FLOW_OFF_IND: priv->flowenabled = false; netif_stop_queue(priv->netdev); break; case CAIF_CTRLCMD_DEINIT_RSP: priv->state = CAIF_DISCONNECTED; break; case CAIF_CTRLCMD_INIT_FAIL_RSP: priv->state = CAIF_DISCONNECTED; wake_up_interruptible(&priv->netmgmt_wq); break; case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND: priv->state = CAIF_SHUTDOWN; netif_tx_disable(priv->netdev); schedule_work(&close_worker); break; case CAIF_CTRLCMD_FLOW_ON_IND: priv->flowenabled = true; netif_wake_queue(priv->netdev); break; case CAIF_CTRLCMD_INIT_RSP: caif_client_register_refcnt(&priv->chnl, chnl_hold, chnl_put); priv->state = CAIF_CONNECTED; priv->flowenabled = true; netif_wake_queue(priv->netdev); wake_up_interruptible(&priv->netmgmt_wq); break; default: break; } } static netdev_tx_t chnl_net_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct chnl_net *priv; struct cfpkt *pkt = NULL; int len; int result = -1; /* Get our private data. */ priv = netdev_priv(dev); if (skb->len > priv->netdev->mtu) { pr_warn("Size of skb exceeded MTU\n"); kfree_skb(skb); dev->stats.tx_errors++; return NETDEV_TX_OK; } if (!priv->flowenabled) { pr_debug("dropping packets flow off\n"); kfree_skb(skb); dev->stats.tx_dropped++; return NETDEV_TX_OK; } if (priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP) swap(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); /* Store original SKB length. */ len = skb->len; pkt = cfpkt_fromnative(CAIF_DIR_OUT, (void *) skb); /* Send the packet down the stack. */ result = priv->chnl.dn->transmit(priv->chnl.dn, pkt); if (result) { dev->stats.tx_dropped++; return NETDEV_TX_OK; } /* Update statistics. */ dev->stats.tx_packets++; dev->stats.tx_bytes += len; return NETDEV_TX_OK; } static int chnl_net_open(struct net_device *dev) { struct chnl_net *priv = NULL; int result = -1; int llifindex, headroom, tailroom, mtu; struct net_device *lldev; ASSERT_RTNL(); priv = netdev_priv(dev); if (!priv) { pr_debug("chnl_net_open: no priv\n"); return -ENODEV; } if (priv->state != CAIF_CONNECTING) { priv->state = CAIF_CONNECTING; result = caif_connect_client(dev_net(dev), &priv->conn_req, &priv->chnl, &llifindex, &headroom, &tailroom); if (result != 0) { pr_debug("err: " "Unable to register and open device," " Err:%d\n", result); goto error; } lldev = __dev_get_by_index(dev_net(dev), llifindex); if (lldev == NULL) { pr_debug("no interface?\n"); result = -ENODEV; goto error; } dev->needed_tailroom = tailroom + lldev->needed_tailroom; dev->hard_header_len = headroom + lldev->hard_header_len + lldev->needed_tailroom; /* * MTU, head-room etc is not know before we have a * CAIF link layer device available. MTU calculation may * override initial RTNL configuration. * MTU is minimum of current mtu, link layer mtu pluss * CAIF head and tail, and PDP GPRS contexts max MTU. */ mtu = min_t(int, dev->mtu, lldev->mtu - (headroom + tailroom)); mtu = min_t(int, GPRS_PDP_MTU, mtu); dev_set_mtu(dev, mtu); if (mtu < 100) { pr_warn("CAIF Interface MTU too small (%d)\n", mtu); result = -ENODEV; goto error; } } rtnl_unlock(); /* Release RTNL lock during connect wait */ result = wait_event_interruptible_timeout(priv->netmgmt_wq, priv->state != CAIF_CONNECTING, CONNECT_TIMEOUT); rtnl_lock(); if (result == -ERESTARTSYS) { pr_debug("wait_event_interruptible woken by a signal\n"); result = -ERESTARTSYS; goto error; } if (result == 0) { pr_debug("connect timeout\n"); result = -ETIMEDOUT; goto error; } if (priv->state != CAIF_CONNECTED) { pr_debug("connect failed\n"); result = -ECONNREFUSED; goto error; } pr_debug("CAIF Netdevice connected\n"); return 0; error: caif_disconnect_client(dev_net(dev), &priv->chnl); priv->state = CAIF_DISCONNECTED; pr_debug("state disconnected\n"); return result; } static int chnl_net_stop(struct net_device *dev) { struct chnl_net *priv; ASSERT_RTNL(); priv = netdev_priv(dev); priv->state = CAIF_DISCONNECTED; caif_disconnect_client(dev_net(dev), &priv->chnl); return 0; } static int chnl_net_init(struct net_device *dev) { struct chnl_net *priv; ASSERT_RTNL(); priv = netdev_priv(dev); strncpy(priv->name, dev->name, sizeof(priv->name)); INIT_LIST_HEAD(&priv->list_field); return 0; } static void chnl_net_uninit(struct net_device *dev) { struct chnl_net *priv; ASSERT_RTNL(); priv = netdev_priv(dev); list_del_init(&priv->list_field); } static const struct net_device_ops netdev_ops = { .ndo_open = chnl_net_open, .ndo_stop = chnl_net_stop, .ndo_init = chnl_net_init, .ndo_uninit = chnl_net_uninit, .ndo_start_xmit = chnl_net_start_xmit, }; static void chnl_net_destructor(struct net_device *dev) { struct chnl_net *priv = netdev_priv(dev); caif_free_client(&priv->chnl); } static void ipcaif_net_setup(struct net_device *dev) { struct chnl_net *priv; dev->netdev_ops = &netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = chnl_net_destructor; dev->flags |= IFF_NOARP; dev->flags |= IFF_POINTOPOINT; dev->mtu = GPRS_PDP_MTU; dev->tx_queue_len = CAIF_NET_DEFAULT_QUEUE_LEN; priv = netdev_priv(dev); priv->chnl.receive = chnl_recv_cb; priv->chnl.ctrlcmd = chnl_flowctrl_cb; priv->netdev = dev; priv->conn_req.protocol = CAIFPROTO_DATAGRAM; priv->conn_req.link_selector = CAIF_LINK_HIGH_BANDW; priv->conn_req.priority = CAIF_PRIO_LOW; /* Insert illegal value */ priv->conn_req.sockaddr.u.dgm.connection_id = UNDEF_CONNID; priv->flowenabled = false; init_waitqueue_head(&priv->netmgmt_wq); } static int ipcaif_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct chnl_net *priv; u8 loop; priv = netdev_priv(dev); if (nla_put_u32(skb, IFLA_CAIF_IPV4_CONNID, priv->conn_req.sockaddr.u.dgm.connection_id) || nla_put_u32(skb, IFLA_CAIF_IPV6_CONNID, priv->conn_req.sockaddr.u.dgm.connection_id)) goto nla_put_failure; loop = priv->conn_req.protocol == CAIFPROTO_DATAGRAM_LOOP; if (nla_put_u8(skb, IFLA_CAIF_LOOPBACK, loop)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static void caif_netlink_parms(struct nlattr *data[], struct caif_connect_request *conn_req) { if (!data) { pr_warn("no params data found\n"); return; } if (data[IFLA_CAIF_IPV4_CONNID]) conn_req->sockaddr.u.dgm.connection_id = nla_get_u32(data[IFLA_CAIF_IPV4_CONNID]); if (data[IFLA_CAIF_IPV6_CONNID]) conn_req->sockaddr.u.dgm.connection_id = nla_get_u32(data[IFLA_CAIF_IPV6_CONNID]); if (data[IFLA_CAIF_LOOPBACK]) { if (nla_get_u8(data[IFLA_CAIF_LOOPBACK])) conn_req->protocol = CAIFPROTO_DATAGRAM_LOOP; else conn_req->protocol = CAIFPROTO_DATAGRAM; } } static int ipcaif_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { int ret; struct chnl_net *caifdev; ASSERT_RTNL(); caifdev = netdev_priv(dev); caif_netlink_parms(data, &caifdev->conn_req); ret = register_netdevice(dev); if (ret) pr_warn("device rtml registration failed\n"); else list_add(&caifdev->list_field, &chnl_net_list); /* Use ifindex as connection id, and use loopback channel default. */ if (caifdev->conn_req.sockaddr.u.dgm.connection_id == UNDEF_CONNID) { caifdev->conn_req.sockaddr.u.dgm.connection_id = dev->ifindex; caifdev->conn_req.protocol = CAIFPROTO_DATAGRAM_LOOP; } return ret; } static int ipcaif_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct chnl_net *caifdev; ASSERT_RTNL(); caifdev = netdev_priv(dev); caif_netlink_parms(data, &caifdev->conn_req); netdev_state_change(dev); return 0; } static size_t ipcaif_get_size(const struct net_device *dev) { return /* IFLA_CAIF_IPV4_CONNID */ nla_total_size(4) + /* IFLA_CAIF_IPV6_CONNID */ nla_total_size(4) + /* IFLA_CAIF_LOOPBACK */ nla_total_size(2) + 0; } static const struct nla_policy ipcaif_policy[IFLA_CAIF_MAX + 1] = { [IFLA_CAIF_IPV4_CONNID] = { .type = NLA_U32 }, [IFLA_CAIF_IPV6_CONNID] = { .type = NLA_U32 }, [IFLA_CAIF_LOOPBACK] = { .type = NLA_U8 } }; static struct rtnl_link_ops ipcaif_link_ops __read_mostly = { .kind = "caif", .priv_size = sizeof(struct chnl_net), .setup = ipcaif_net_setup, .maxtype = IFLA_CAIF_MAX, .policy = ipcaif_policy, .newlink = ipcaif_newlink, .changelink = ipcaif_changelink, .get_size = ipcaif_get_size, .fill_info = ipcaif_fill_info, }; static int __init chnl_init_module(void) { return rtnl_link_register(&ipcaif_link_ops); } static void __exit chnl_exit_module(void) { struct chnl_net *dev = NULL; struct list_head *list_node; struct list_head *_tmp; rtnl_link_unregister(&ipcaif_link_ops); rtnl_lock(); list_for_each_safe(list_node, _tmp, &chnl_net_list) { dev = list_entry(list_node, struct chnl_net, list_field); list_del_init(list_node); delete_device(dev); } rtnl_unlock(); } module_init(chnl_init_module); module_exit(chnl_exit_module);
4 4 2 2 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 // SPDX-License-Identifier: GPL-2.0-only /* * v4l2-dv-timings - dv-timings helper functions * * Copyright 2013 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/rational.h> #include <linux/videodev2.h> #include <linux/v4l2-dv-timings.h> #include <media/v4l2-dv-timings.h> #include <linux/math64.h> #include <linux/hdmi.h> #include <media/cec.h> MODULE_AUTHOR("Hans Verkuil"); MODULE_DESCRIPTION("V4L2 DV Timings Helper Functions"); MODULE_LICENSE("GPL"); const struct v4l2_dv_timings v4l2_dv_timings_presets[] = { V4L2_DV_BT_CEA_640X480P59_94, V4L2_DV_BT_CEA_720X480I59_94, V4L2_DV_BT_CEA_720X480P59_94, V4L2_DV_BT_CEA_720X576I50, V4L2_DV_BT_CEA_720X576P50, V4L2_DV_BT_CEA_1280X720P24, V4L2_DV_BT_CEA_1280X720P25, V4L2_DV_BT_CEA_1280X720P30, V4L2_DV_BT_CEA_1280X720P50, V4L2_DV_BT_CEA_1280X720P60, V4L2_DV_BT_CEA_1920X1080P24, V4L2_DV_BT_CEA_1920X1080P25, V4L2_DV_BT_CEA_1920X1080P30, V4L2_DV_BT_CEA_1920X1080I50, V4L2_DV_BT_CEA_1920X1080P50, V4L2_DV_BT_CEA_1920X1080I60, V4L2_DV_BT_CEA_1920X1080P60, V4L2_DV_BT_DMT_640X350P85, V4L2_DV_BT_DMT_640X400P85, V4L2_DV_BT_DMT_720X400P85, V4L2_DV_BT_DMT_640X480P72, V4L2_DV_BT_DMT_640X480P75, V4L2_DV_BT_DMT_640X480P85, V4L2_DV_BT_DMT_800X600P56, V4L2_DV_BT_DMT_800X600P60, V4L2_DV_BT_DMT_800X600P72, V4L2_DV_BT_DMT_800X600P75, V4L2_DV_BT_DMT_800X600P85, V4L2_DV_BT_DMT_800X600P120_RB, V4L2_DV_BT_DMT_848X480P60, V4L2_DV_BT_DMT_1024X768I43, V4L2_DV_BT_DMT_1024X768P60, V4L2_DV_BT_DMT_1024X768P70, V4L2_DV_BT_DMT_1024X768P75, V4L2_DV_BT_DMT_1024X768P85, V4L2_DV_BT_DMT_1024X768P120_RB, V4L2_DV_BT_DMT_1152X864P75, V4L2_DV_BT_DMT_1280X768P60_RB, V4L2_DV_BT_DMT_1280X768P60, V4L2_DV_BT_DMT_1280X768P75, V4L2_DV_BT_DMT_1280X768P85, V4L2_DV_BT_DMT_1280X768P120_RB, V4L2_DV_BT_DMT_1280X800P60_RB, V4L2_DV_BT_DMT_1280X800P60, V4L2_DV_BT_DMT_1280X800P75, V4L2_DV_BT_DMT_1280X800P85, V4L2_DV_BT_DMT_1280X800P120_RB, V4L2_DV_BT_DMT_1280X960P60, V4L2_DV_BT_DMT_1280X960P85, V4L2_DV_BT_DMT_1280X960P120_RB, V4L2_DV_BT_DMT_1280X1024P60, V4L2_DV_BT_DMT_1280X1024P75, V4L2_DV_BT_DMT_1280X1024P85, V4L2_DV_BT_DMT_1280X1024P120_RB, V4L2_DV_BT_DMT_1360X768P60, V4L2_DV_BT_DMT_1360X768P120_RB, V4L2_DV_BT_DMT_1366X768P60, V4L2_DV_BT_DMT_1366X768P60_RB, V4L2_DV_BT_DMT_1400X1050P60_RB, V4L2_DV_BT_DMT_1400X1050P60, V4L2_DV_BT_DMT_1400X1050P75, V4L2_DV_BT_DMT_1400X1050P85, V4L2_DV_BT_DMT_1400X1050P120_RB, V4L2_DV_BT_DMT_1440X900P60_RB, V4L2_DV_BT_DMT_1440X900P60, V4L2_DV_BT_DMT_1440X900P75, V4L2_DV_BT_DMT_1440X900P85, V4L2_DV_BT_DMT_1440X900P120_RB, V4L2_DV_BT_DMT_1600X900P60_RB, V4L2_DV_BT_DMT_1600X1200P60, V4L2_DV_BT_DMT_1600X1200P65, V4L2_DV_BT_DMT_1600X1200P70, V4L2_DV_BT_DMT_1600X1200P75, V4L2_DV_BT_DMT_1600X1200P85, V4L2_DV_BT_DMT_1600X1200P120_RB, V4L2_DV_BT_DMT_1680X1050P60_RB, V4L2_DV_BT_DMT_1680X1050P60, V4L2_DV_BT_DMT_1680X1050P75, V4L2_DV_BT_DMT_1680X1050P85, V4L2_DV_BT_DMT_1680X1050P120_RB, V4L2_DV_BT_DMT_1792X1344P60, V4L2_DV_BT_DMT_1792X1344P75, V4L2_DV_BT_DMT_1792X1344P120_RB, V4L2_DV_BT_DMT_1856X1392P60, V4L2_DV_BT_DMT_1856X1392P75, V4L2_DV_BT_DMT_1856X1392P120_RB, V4L2_DV_BT_DMT_1920X1200P60_RB, V4L2_DV_BT_DMT_1920X1200P60, V4L2_DV_BT_DMT_1920X1200P75, V4L2_DV_BT_DMT_1920X1200P85, V4L2_DV_BT_DMT_1920X1200P120_RB, V4L2_DV_BT_DMT_1920X1440P60, V4L2_DV_BT_DMT_1920X1440P75, V4L2_DV_BT_DMT_1920X1440P120_RB, V4L2_DV_BT_DMT_2048X1152P60_RB, V4L2_DV_BT_DMT_2560X1600P60_RB, V4L2_DV_BT_DMT_2560X1600P60, V4L2_DV_BT_DMT_2560X1600P75, V4L2_DV_BT_DMT_2560X1600P85, V4L2_DV_BT_DMT_2560X1600P120_RB, V4L2_DV_BT_CEA_3840X2160P24, V4L2_DV_BT_CEA_3840X2160P25, V4L2_DV_BT_CEA_3840X2160P30, V4L2_DV_BT_CEA_3840X2160P50, V4L2_DV_BT_CEA_3840X2160P60, V4L2_DV_BT_CEA_4096X2160P24, V4L2_DV_BT_CEA_4096X2160P25, V4L2_DV_BT_CEA_4096X2160P30, V4L2_DV_BT_CEA_4096X2160P50, V4L2_DV_BT_DMT_4096X2160P59_94_RB, V4L2_DV_BT_CEA_4096X2160P60, { } }; EXPORT_SYMBOL_GPL(v4l2_dv_timings_presets); bool v4l2_valid_dv_timings(const struct v4l2_dv_timings *t, const struct v4l2_dv_timings_cap *dvcap, v4l2_check_dv_timings_fnc fnc, void *fnc_handle) { const struct v4l2_bt_timings *bt = &t->bt; const struct v4l2_bt_timings_cap *cap = &dvcap->bt; u32 caps = cap->capabilities; const u32 max_vert = 10240; u32 max_hor = 3 * bt->width; if (t->type != V4L2_DV_BT_656_1120) return false; if (t->type != dvcap->type || bt->height < cap->min_height || bt->height > cap->max_height || bt->width < cap->min_width || bt->width > cap->max_width || bt->pixelclock < cap->min_pixelclock || bt->pixelclock > cap->max_pixelclock || (!(caps & V4L2_DV_BT_CAP_CUSTOM) && cap->standards && bt->standards && !(bt->standards & cap->standards)) || (bt->interlaced && !(caps & V4L2_DV_BT_CAP_INTERLACED)) || (!bt->interlaced && !(caps & V4L2_DV_BT_CAP_PROGRESSIVE))) return false; /* sanity checks for the blanking timings */ if (!bt->interlaced && (bt->il_vbackporch || bt->il_vsync || bt->il_vfrontporch)) return false; /* * Some video receivers cannot properly separate the frontporch, * backporch and sync values, and instead they only have the total * blanking. That can be assigned to any of these three fields. * So just check that none of these are way out of range. */ if (bt->hfrontporch > max_hor || bt->hsync > max_hor || bt->hbackporch > max_hor) return false; if (bt->vfrontporch > max_vert || bt->vsync > max_vert || bt->vbackporch > max_vert) return false; if (bt->interlaced && (bt->il_vfrontporch > max_vert || bt->il_vsync > max_vert || bt->il_vbackporch > max_vert)) return false; return fnc == NULL || fnc(t, fnc_handle); } EXPORT_SYMBOL_GPL(v4l2_valid_dv_timings); int v4l2_enum_dv_timings_cap(struct v4l2_enum_dv_timings *t, const struct v4l2_dv_timings_cap *cap, v4l2_check_dv_timings_fnc fnc, void *fnc_handle) { u32 i, idx; memset(t->reserved, 0, sizeof(t->reserved)); for (i = idx = 0; v4l2_dv_timings_presets[i].bt.width; i++) { if (v4l2_valid_dv_timings(v4l2_dv_timings_presets + i, cap, fnc, fnc_handle) && idx++ == t->index) { t->timings = v4l2_dv_timings_presets[i]; return 0; } } return -EINVAL; } EXPORT_SYMBOL_GPL(v4l2_enum_dv_timings_cap); bool v4l2_find_dv_timings_cap(struct v4l2_dv_timings *t, const struct v4l2_dv_timings_cap *cap, unsigned pclock_delta, v4l2_check_dv_timings_fnc fnc, void *fnc_handle) { int i; if (!v4l2_valid_dv_timings(t, cap, fnc, fnc_handle)) return false; for (i = 0; v4l2_dv_timings_presets[i].bt.width; i++) { if (v4l2_valid_dv_timings(v4l2_dv_timings_presets + i, cap, fnc, fnc_handle) && v4l2_match_dv_timings(t, v4l2_dv_timings_presets + i, pclock_delta, false)) { u32 flags = t->bt.flags & V4L2_DV_FL_REDUCED_FPS; *t = v4l2_dv_timings_presets[i]; if (can_reduce_fps(&t->bt)) t->bt.flags |= flags; return true; } } return false; } EXPORT_SYMBOL_GPL(v4l2_find_dv_timings_cap); bool v4l2_find_dv_timings_cea861_vic(struct v4l2_dv_timings *t, u8 vic) { unsigned int i; for (i = 0; v4l2_dv_timings_presets[i].bt.width; i++) { const struct v4l2_bt_timings *bt = &v4l2_dv_timings_presets[i].bt; if ((bt->flags & V4L2_DV_FL_HAS_CEA861_VIC) && bt->cea861_vic == vic) { *t = v4l2_dv_timings_presets[i]; return true; } } return false; } EXPORT_SYMBOL_GPL(v4l2_find_dv_timings_cea861_vic); /** * v4l2_match_dv_timings - check if two timings match * @t1: compare this v4l2_dv_timings struct... * @t2: with this struct. * @pclock_delta: the allowed pixelclock deviation. * @match_reduced_fps: if true, then fail if V4L2_DV_FL_REDUCED_FPS does not * match. * * Compare t1 with t2 with a given margin of error for the pixelclock. */ bool v4l2_match_dv_timings(const struct v4l2_dv_timings *t1, const struct v4l2_dv_timings *t2, unsigned pclock_delta, bool match_reduced_fps) { if (t1->type != t2->type || t1->type != V4L2_DV_BT_656_1120) return false; if (t1->bt.width == t2->bt.width && t1->bt.height == t2->bt.height && t1->bt.interlaced == t2->bt.interlaced && t1->bt.polarities == t2->bt.polarities && t1->bt.pixelclock >= t2->bt.pixelclock - pclock_delta && t1->bt.pixelclock <= t2->bt.pixelclock + pclock_delta && t1->bt.hfrontporch == t2->bt.hfrontporch && t1->bt.hsync == t2->bt.hsync && t1->bt.hbackporch == t2->bt.hbackporch && t1->bt.vfrontporch == t2->bt.vfrontporch && t1->bt.vsync == t2->bt.vsync && t1->bt.vbackporch == t2->bt.vbackporch && (!match_reduced_fps || (t1->bt.flags & V4L2_DV_FL_REDUCED_FPS) == (t2->bt.flags & V4L2_DV_FL_REDUCED_FPS)) && (!t1->bt.interlaced || (t1->bt.il_vfrontporch == t2->bt.il_vfrontporch && t1->bt.il_vsync == t2->bt.il_vsync && t1->bt.il_vbackporch == t2->bt.il_vbackporch))) return true; return false; } EXPORT_SYMBOL_GPL(v4l2_match_dv_timings); void v4l2_print_dv_timings(const char *dev_prefix, const char *prefix, const struct v4l2_dv_timings *t, bool detailed) { const struct v4l2_bt_timings *bt = &t->bt; u32 htot, vtot; u32 fps; if (t->type != V4L2_DV_BT_656_1120) return; htot = V4L2_DV_BT_FRAME_WIDTH(bt); vtot = V4L2_DV_BT_FRAME_HEIGHT(bt); if (bt->interlaced) vtot /= 2; fps = (htot * vtot) > 0 ? div_u64((100 * (u64)bt->pixelclock), (htot * vtot)) : 0; if (prefix == NULL) prefix = ""; pr_info("%s: %s%ux%u%s%u.%02u (%ux%u)\n", dev_prefix, prefix, bt->width, bt->height, bt->interlaced ? "i" : "p", fps / 100, fps % 100, htot, vtot); if (!detailed) return; pr_info("%s: horizontal: fp = %u, %ssync = %u, bp = %u\n", dev_prefix, bt->hfrontporch, (bt->polarities & V4L2_DV_HSYNC_POS_POL) ? "+" : "-", bt->hsync, bt->hbackporch); pr_info("%s: vertical: fp = %u, %ssync = %u, bp = %u\n", dev_prefix, bt->vfrontporch, (bt->polarities & V4L2_DV_VSYNC_POS_POL) ? "+" : "-", bt->vsync, bt->vbackporch); if (bt->interlaced) pr_info("%s: vertical bottom field: fp = %u, %ssync = %u, bp = %u\n", dev_prefix, bt->il_vfrontporch, (bt->polarities & V4L2_DV_VSYNC_POS_POL) ? "+" : "-", bt->il_vsync, bt->il_vbackporch); pr_info("%s: pixelclock: %llu\n", dev_prefix, bt->pixelclock); pr_info("%s: flags (0x%x):%s%s%s%s%s%s%s%s%s%s\n", dev_prefix, bt->flags, (bt->flags & V4L2_DV_FL_REDUCED_BLANKING) ? " REDUCED_BLANKING" : "", ((bt->flags & V4L2_DV_FL_REDUCED_BLANKING) && bt->vsync == 8) ? " (V2)" : "", (bt->flags & V4L2_DV_FL_CAN_REDUCE_FPS) ? " CAN_REDUCE_FPS" : "", (bt->flags & V4L2_DV_FL_REDUCED_FPS) ? " REDUCED_FPS" : "", (bt->flags & V4L2_DV_FL_HALF_LINE) ? " HALF_LINE" : "", (bt->flags & V4L2_DV_FL_IS_CE_VIDEO) ? " CE_VIDEO" : "", (bt->flags & V4L2_DV_FL_FIRST_FIELD_EXTRA_LINE) ? " FIRST_FIELD_EXTRA_LINE" : "", (bt->flags & V4L2_DV_FL_HAS_PICTURE_ASPECT) ? " HAS_PICTURE_ASPECT" : "", (bt->flags & V4L2_DV_FL_HAS_CEA861_VIC) ? " HAS_CEA861_VIC" : "", (bt->flags & V4L2_DV_FL_HAS_HDMI_VIC) ? " HAS_HDMI_VIC" : ""); pr_info("%s: standards (0x%x):%s%s%s%s%s\n", dev_prefix, bt->standards, (bt->standards & V4L2_DV_BT_STD_CEA861) ? " CEA" : "", (bt->standards & V4L2_DV_BT_STD_DMT) ? " DMT" : "", (bt->standards & V4L2_DV_BT_STD_CVT) ? " CVT" : "", (bt->standards & V4L2_DV_BT_STD_GTF) ? " GTF" : "", (bt->standards & V4L2_DV_BT_STD_SDI) ? " SDI" : ""); if (bt->flags & V4L2_DV_FL_HAS_PICTURE_ASPECT) pr_info("%s: picture aspect (hor:vert): %u:%u\n", dev_prefix, bt->picture_aspect.numerator, bt->picture_aspect.denominator); if (bt->flags & V4L2_DV_FL_HAS_CEA861_VIC) pr_info("%s: CEA-861 VIC: %u\n", dev_prefix, bt->cea861_vic); if (bt->flags & V4L2_DV_FL_HAS_HDMI_VIC) pr_info("%s: HDMI VIC: %u\n", dev_prefix, bt->hdmi_vic); } EXPORT_SYMBOL_GPL(v4l2_print_dv_timings); struct v4l2_fract v4l2_dv_timings_aspect_ratio(const struct v4l2_dv_timings *t) { struct v4l2_fract ratio = { 1, 1 }; unsigned long n, d; if (t->type != V4L2_DV_BT_656_1120) return ratio; if (!(t->bt.flags & V4L2_DV_FL_HAS_PICTURE_ASPECT)) return ratio; ratio.numerator = t->bt.width * t->bt.picture_aspect.denominator; ratio.denominator = t->bt.height * t->bt.picture_aspect.numerator; rational_best_approximation(ratio.numerator, ratio.denominator, ratio.numerator, ratio.denominator, &n, &d); ratio.numerator = n; ratio.denominator = d; return ratio; } EXPORT_SYMBOL_GPL(v4l2_dv_timings_aspect_ratio); /** v4l2_calc_timeperframe - helper function to calculate timeperframe based * v4l2_dv_timings fields. * @t - Timings for the video mode. * * Calculates the expected timeperframe using the pixel clock value and * horizontal/vertical measures. This means that v4l2_dv_timings structure * must be correctly and fully filled. */ struct v4l2_fract v4l2_calc_timeperframe(const struct v4l2_dv_timings *t) { const struct v4l2_bt_timings *bt = &t->bt; struct v4l2_fract fps_fract = { 1, 1 }; unsigned long n, d; u32 htot, vtot, fps; u64 pclk; if (t->type != V4L2_DV_BT_656_1120) return fps_fract; htot = V4L2_DV_BT_FRAME_WIDTH(bt); vtot = V4L2_DV_BT_FRAME_HEIGHT(bt); pclk = bt->pixelclock; if ((bt->flags & V4L2_DV_FL_CAN_DETECT_REDUCED_FPS) && (bt->flags & V4L2_DV_FL_REDUCED_FPS)) pclk = div_u64(pclk * 1000ULL, 1001); fps = (htot * vtot) > 0 ? div_u64((100 * pclk), (htot * vtot)) : 0; if (!fps) return fps_fract; rational_best_approximation(fps, 100, fps, 100, &n, &d); fps_fract.numerator = d; fps_fract.denominator = n; return fps_fract; } EXPORT_SYMBOL_GPL(v4l2_calc_timeperframe); /* * CVT defines * Based on Coordinated Video Timings Standard * version 1.1 September 10, 2003 */ #define CVT_PXL_CLK_GRAN 250000 /* pixel clock granularity */ #define CVT_PXL_CLK_GRAN_RB_V2 1000 /* granularity for reduced blanking v2*/ /* Normal blanking */ #define CVT_MIN_V_BPORCH 7 /* lines */ #define CVT_MIN_V_PORCH_RND 3 /* lines */ #define CVT_MIN_VSYNC_BP 550 /* min time of vsync + back porch (us) */ #define CVT_HSYNC_PERCENT 8 /* nominal hsync as percentage of line */ /* Normal blanking for CVT uses GTF to calculate horizontal blanking */ #define CVT_CELL_GRAN 8 /* character cell granularity */ #define CVT_M 600 /* blanking formula gradient */ #define CVT_C 40 /* blanking formula offset */ #define CVT_K 128 /* blanking formula scaling factor */ #define CVT_J 20 /* blanking formula scaling factor */ #define CVT_C_PRIME (((CVT_C - CVT_J) * CVT_K / 256) + CVT_J) #define CVT_M_PRIME (CVT_K * CVT_M / 256) /* Reduced Blanking */ #define CVT_RB_MIN_V_BPORCH 7 /* lines */ #define CVT_RB_V_FPORCH 3 /* lines */ #define CVT_RB_MIN_V_BLANK 460 /* us */ #define CVT_RB_H_SYNC 32 /* pixels */ #define CVT_RB_H_BLANK 160 /* pixels */ /* Reduce blanking Version 2 */ #define CVT_RB_V2_H_BLANK 80 /* pixels */ #define CVT_RB_MIN_V_FPORCH 3 /* lines */ #define CVT_RB_V2_MIN_V_FPORCH 1 /* lines */ #define CVT_RB_V_BPORCH 6 /* lines */ /** v4l2_detect_cvt - detect if the given timings follow the CVT standard * @frame_height - the total height of the frame (including blanking) in lines. * @hfreq - the horizontal frequency in Hz. * @vsync - the height of the vertical sync in lines. * @active_width - active width of image (does not include blanking). This * information is needed only in case of version 2 of reduced blanking. * In other cases, this parameter does not have any effect on timings. * @polarities - the horizontal and vertical polarities (same as struct * v4l2_bt_timings polarities). * @interlaced - if this flag is true, it indicates interlaced format * @fmt - the resulting timings. * * This function will attempt to detect if the given values correspond to a * valid CVT format. If so, then it will return true, and fmt will be filled * in with the found CVT timings. */ bool v4l2_detect_cvt(unsigned frame_height, unsigned hfreq, unsigned vsync, unsigned active_width, u32 polarities, bool interlaced, struct v4l2_dv_timings *fmt) { int v_fp, v_bp, h_fp, h_bp, hsync; int frame_width, image_height, image_width; bool reduced_blanking; bool rb_v2 = false; unsigned pix_clk; if (vsync < 4 || vsync > 8) return false; if (polarities == V4L2_DV_VSYNC_POS_POL) reduced_blanking = false; else if (polarities == V4L2_DV_HSYNC_POS_POL) reduced_blanking = true; else return false; if (reduced_blanking && vsync == 8) rb_v2 = true; if (rb_v2 && active_width == 0) return false; if (!rb_v2 && vsync > 7) return false; if (hfreq == 0) return false; /* Vertical */ if (reduced_blanking) { if (rb_v2) { v_bp = CVT_RB_V_BPORCH; v_fp = (CVT_RB_MIN_V_BLANK * hfreq) / 1000000 + 1; v_fp -= vsync + v_bp; if (v_fp < CVT_RB_V2_MIN_V_FPORCH) v_fp = CVT_RB_V2_MIN_V_FPORCH; } else { v_fp = CVT_RB_V_FPORCH; v_bp = (CVT_RB_MIN_V_BLANK * hfreq) / 1000000 + 1; v_bp -= vsync + v_fp; if (v_bp < CVT_RB_MIN_V_BPORCH) v_bp = CVT_RB_MIN_V_BPORCH; } } else { v_fp = CVT_MIN_V_PORCH_RND; v_bp = (CVT_MIN_VSYNC_BP * hfreq) / 1000000 + 1 - vsync; if (v_bp < CVT_MIN_V_BPORCH) v_bp = CVT_MIN_V_BPORCH; } if (interlaced) image_height = (frame_height - 2 * v_fp - 2 * vsync - 2 * v_bp) & ~0x1; else image_height = (frame_height - v_fp - vsync - v_bp + 1) & ~0x1; if (image_height < 0) return false; /* Aspect ratio based on vsync */ switch (vsync) { case 4: image_width = (image_height * 4) / 3; break; case 5: image_width = (image_height * 16) / 9; break; case 6: image_width = (image_height * 16) / 10; break; case 7: /* special case */ if (image_height == 1024) image_width = (image_height * 5) / 4; else if (image_height == 768) image_width = (image_height * 15) / 9; else return false; break; case 8: image_width = active_width; break; default: return false; } if (!rb_v2) image_width = image_width & ~7; /* Horizontal */ if (reduced_blanking) { int h_blank; int clk_gran; h_blank = rb_v2 ? CVT_RB_V2_H_BLANK : CVT_RB_H_BLANK; clk_gran = rb_v2 ? CVT_PXL_CLK_GRAN_RB_V2 : CVT_PXL_CLK_GRAN; pix_clk = (image_width + h_blank) * hfreq; pix_clk = (pix_clk / clk_gran) * clk_gran; h_bp = h_blank / 2; hsync = CVT_RB_H_SYNC; h_fp = h_blank - h_bp - hsync; frame_width = image_width + h_blank; } else { unsigned ideal_duty_cycle_per_myriad = 100 * CVT_C_PRIME - (CVT_M_PRIME * 100000) / hfreq; int h_blank; if (ideal_duty_cycle_per_myriad < 2000) ideal_duty_cycle_per_myriad = 2000; h_blank = image_width * ideal_duty_cycle_per_myriad / (10000 - ideal_duty_cycle_per_myriad); h_blank = (h_blank / (2 * CVT_CELL_GRAN)) * 2 * CVT_CELL_GRAN; pix_clk = (image_width + h_blank) * hfreq; pix_clk = (pix_clk / CVT_PXL_CLK_GRAN) * CVT_PXL_CLK_GRAN; h_bp = h_blank / 2; frame_width = image_width + h_blank; hsync = frame_width * CVT_HSYNC_PERCENT / 100; hsync = (hsync / CVT_CELL_GRAN) * CVT_CELL_GRAN; h_fp = h_blank - hsync - h_bp; } fmt->type = V4L2_DV_BT_656_1120; fmt->bt.polarities = polarities; fmt->bt.width = image_width; fmt->bt.height = image_height; fmt->bt.hfrontporch = h_fp; fmt->bt.vfrontporch = v_fp; fmt->bt.hsync = hsync; fmt->bt.vsync = vsync; fmt->bt.hbackporch = frame_width - image_width - h_fp - hsync; if (!interlaced) { fmt->bt.vbackporch = frame_height - image_height - v_fp - vsync; fmt->bt.interlaced = V4L2_DV_PROGRESSIVE; } else { fmt->bt.vbackporch = (frame_height - image_height - 2 * v_fp - 2 * vsync) / 2; fmt->bt.il_vbackporch = frame_height - image_height - 2 * v_fp - 2 * vsync - fmt->bt.vbackporch; fmt->bt.il_vfrontporch = v_fp; fmt->bt.il_vsync = vsync; fmt->bt.flags |= V4L2_DV_FL_HALF_LINE; fmt->bt.interlaced = V4L2_DV_INTERLACED; } fmt->bt.pixelclock = pix_clk; fmt->bt.standards = V4L2_DV_BT_STD_CVT; if (reduced_blanking) fmt->bt.flags |= V4L2_DV_FL_REDUCED_BLANKING; return true; } EXPORT_SYMBOL_GPL(v4l2_detect_cvt); /* * GTF defines * Based on Generalized Timing Formula Standard * Version 1.1 September 2, 1999 */ #define GTF_PXL_CLK_GRAN 250000 /* pixel clock granularity */ #define GTF_MIN_VSYNC_BP 550 /* min time of vsync + back porch (us) */ #define GTF_V_FP 1 /* vertical front porch (lines) */ #define GTF_CELL_GRAN 8 /* character cell granularity */ /* Default */ #define GTF_D_M 600 /* blanking formula gradient */ #define GTF_D_C 40 /* blanking formula offset */ #define GTF_D_K 128 /* blanking formula scaling factor */ #define GTF_D_J 20 /* blanking formula scaling factor */ #define GTF_D_C_PRIME ((((GTF_D_C - GTF_D_J) * GTF_D_K) / 256) + GTF_D_J) #define GTF_D_M_PRIME ((GTF_D_K * GTF_D_M) / 256) /* Secondary */ #define GTF_S_M 3600 /* blanking formula gradient */ #define GTF_S_C 40 /* blanking formula offset */ #define GTF_S_K 128 /* blanking formula scaling factor */ #define GTF_S_J 35 /* blanking formula scaling factor */ #define GTF_S_C_PRIME ((((GTF_S_C - GTF_S_J) * GTF_S_K) / 256) + GTF_S_J) #define GTF_S_M_PRIME ((GTF_S_K * GTF_S_M) / 256) /** v4l2_detect_gtf - detect if the given timings follow the GTF standard * @frame_height - the total height of the frame (including blanking) in lines. * @hfreq - the horizontal frequency in Hz. * @vsync - the height of the vertical sync in lines. * @polarities - the horizontal and vertical polarities (same as struct * v4l2_bt_timings polarities). * @interlaced - if this flag is true, it indicates interlaced format * @aspect - preferred aspect ratio. GTF has no method of determining the * aspect ratio in order to derive the image width from the * image height, so it has to be passed explicitly. Usually * the native screen aspect ratio is used for this. If it * is not filled in correctly, then 16:9 will be assumed. * @fmt - the resulting timings. * * This function will attempt to detect if the given values correspond to a * valid GTF format. If so, then it will return true, and fmt will be filled * in with the found GTF timings. */ bool v4l2_detect_gtf(unsigned frame_height, unsigned hfreq, unsigned vsync, u32 polarities, bool interlaced, struct v4l2_fract aspect, struct v4l2_dv_timings *fmt) { int pix_clk; int v_fp, v_bp, h_fp, hsync; int frame_width, image_height, image_width; bool default_gtf; int h_blank; if (vsync != 3) return false; if (polarities == V4L2_DV_VSYNC_POS_POL) default_gtf = true; else if (polarities == V4L2_DV_HSYNC_POS_POL) default_gtf = false; else return false; if (hfreq == 0) return false; /* Vertical */ v_fp = GTF_V_FP; v_bp = (GTF_MIN_VSYNC_BP * hfreq + 500000) / 1000000 - vsync; if (interlaced) image_height = (frame_height - 2 * v_fp - 2 * vsync - 2 * v_bp) & ~0x1; else image_height = (frame_height - v_fp - vsync - v_bp + 1) & ~0x1; if (image_height < 0) return false; if (aspect.numerator == 0 || aspect.denominator == 0) { aspect.numerator = 16; aspect.denominator = 9; } image_width = ((image_height * aspect.numerator) / aspect.denominator); image_width = (image_width + GTF_CELL_GRAN/2) & ~(GTF_CELL_GRAN - 1); /* Horizontal */ if (default_gtf) { u64 num; u32 den; num = ((image_width * GTF_D_C_PRIME * (u64)hfreq) - ((u64)image_width * GTF_D_M_PRIME * 1000)); den = (hfreq * (100 - GTF_D_C_PRIME) + GTF_D_M_PRIME * 1000) * (2 * GTF_CELL_GRAN); h_blank = div_u64((num + (den >> 1)), den); h_blank *= (2 * GTF_CELL_GRAN); } else { u64 num; u32 den; num = ((image_width * GTF_S_C_PRIME * (u64)hfreq) - ((u64)image_width * GTF_S_M_PRIME * 1000)); den = (hfreq * (100 - GTF_S_C_PRIME) + GTF_S_M_PRIME * 1000) * (2 * GTF_CELL_GRAN); h_blank = div_u64((num + (den >> 1)), den); h_blank *= (2 * GTF_CELL_GRAN); } frame_width = image_width + h_blank; pix_clk = (image_width + h_blank) * hfreq; pix_clk = pix_clk / GTF_PXL_CLK_GRAN * GTF_PXL_CLK_GRAN; hsync = (frame_width * 8 + 50) / 100; hsync = DIV_ROUND_CLOSEST(hsync, GTF_CELL_GRAN) * GTF_CELL_GRAN; h_fp = h_blank / 2 - hsync; fmt->type = V4L2_DV_BT_656_1120; fmt->bt.polarities = polarities; fmt->bt.width = image_width; fmt->bt.height = image_height; fmt->bt.hfrontporch = h_fp; fmt->bt.vfrontporch = v_fp; fmt->bt.hsync = hsync; fmt->bt.vsync = vsync; fmt->bt.hbackporch = frame_width - image_width - h_fp - hsync; if (!interlaced) { fmt->bt.vbackporch = frame_height - image_height - v_fp - vsync; fmt->bt.interlaced = V4L2_DV_PROGRESSIVE; } else { fmt->bt.vbackporch = (frame_height - image_height - 2 * v_fp - 2 * vsync) / 2; fmt->bt.il_vbackporch = frame_height - image_height - 2 * v_fp - 2 * vsync - fmt->bt.vbackporch; fmt->bt.il_vfrontporch = v_fp; fmt->bt.il_vsync = vsync; fmt->bt.flags |= V4L2_DV_FL_HALF_LINE; fmt->bt.interlaced = V4L2_DV_INTERLACED; } fmt->bt.pixelclock = pix_clk; fmt->bt.standards = V4L2_DV_BT_STD_GTF; if (!default_gtf) fmt->bt.flags |= V4L2_DV_FL_REDUCED_BLANKING; return true; } EXPORT_SYMBOL_GPL(v4l2_detect_gtf); /** v4l2_calc_aspect_ratio - calculate the aspect ratio based on bytes * 0x15 and 0x16 from the EDID. * @hor_landscape - byte 0x15 from the EDID. * @vert_portrait - byte 0x16 from the EDID. * * Determines the aspect ratio from the EDID. * See VESA Enhanced EDID standard, release A, rev 2, section 3.6.2: * "Horizontal and Vertical Screen Size or Aspect Ratio" */ struct v4l2_fract v4l2_calc_aspect_ratio(u8 hor_landscape, u8 vert_portrait) { struct v4l2_fract aspect = { 16, 9 }; u8 ratio; /* Nothing filled in, fallback to 16:9 */ if (!hor_landscape && !vert_portrait) return aspect; /* Both filled in, so they are interpreted as the screen size in cm */ if (hor_landscape && vert_portrait) { aspect.numerator = hor_landscape; aspect.denominator = vert_portrait; return aspect; } /* Only one is filled in, so interpret them as a ratio: (val + 99) / 100 */ ratio = hor_landscape | vert_portrait; /* Change some rounded values into the exact aspect ratio */ if (ratio == 79) { aspect.numerator = 16; aspect.denominator = 9; } else if (ratio == 34) { aspect.numerator = 4; aspect.denominator = 3; } else if (ratio == 68) { aspect.numerator = 15; aspect.denominator = 9; } else { aspect.numerator = hor_landscape + 99; aspect.denominator = 100; } if (hor_landscape) return aspect; /* The aspect ratio is for portrait, so swap numerator and denominator */ swap(aspect.denominator, aspect.numerator); return aspect; } EXPORT_SYMBOL_GPL(v4l2_calc_aspect_ratio); /** v4l2_hdmi_rx_colorimetry - determine HDMI colorimetry information * based on various InfoFrames. * @avi: the AVI InfoFrame * @hdmi: the HDMI Vendor InfoFrame, may be NULL * @height: the frame height * * Determines the HDMI colorimetry information, i.e. how the HDMI * pixel color data should be interpreted. * * Note that some of the newer features (DCI-P3, HDR) are not yet * implemented: the hdmi.h header needs to be updated to the HDMI 2.0 * and CTA-861-G standards. */ struct v4l2_hdmi_colorimetry v4l2_hdmi_rx_colorimetry(const struct hdmi_avi_infoframe *avi, const struct hdmi_vendor_infoframe *hdmi, unsigned int height) { struct v4l2_hdmi_colorimetry c = { V4L2_COLORSPACE_SRGB, V4L2_YCBCR_ENC_DEFAULT, V4L2_QUANTIZATION_FULL_RANGE, V4L2_XFER_FUNC_SRGB }; bool is_ce = avi->video_code || (hdmi && hdmi->vic); bool is_sdtv = height <= 576; bool default_is_lim_range_rgb = avi->video_code > 1; switch (avi->colorspace) { case HDMI_COLORSPACE_RGB: /* RGB pixel encoding */ switch (avi->colorimetry) { case HDMI_COLORIMETRY_EXTENDED: switch (avi->extended_colorimetry) { case HDMI_EXTENDED_COLORIMETRY_OPRGB: c.colorspace = V4L2_COLORSPACE_OPRGB; c.xfer_func = V4L2_XFER_FUNC_OPRGB; break; case HDMI_EXTENDED_COLORIMETRY_BT2020: c.colorspace = V4L2_COLORSPACE_BT2020; c.xfer_func = V4L2_XFER_FUNC_709; break; default: break; } break; default: break; } switch (avi->quantization_range) { case HDMI_QUANTIZATION_RANGE_LIMITED: c.quantization = V4L2_QUANTIZATION_LIM_RANGE; break; case HDMI_QUANTIZATION_RANGE_FULL: break; default: if (default_is_lim_range_rgb) c.quantization = V4L2_QUANTIZATION_LIM_RANGE; break; } break; default: /* YCbCr pixel encoding */ c.quantization = V4L2_QUANTIZATION_LIM_RANGE; switch (avi->colorimetry) { case HDMI_COLORIMETRY_NONE: if (!is_ce) break; if (is_sdtv) { c.colorspace = V4L2_COLORSPACE_SMPTE170M; c.ycbcr_enc = V4L2_YCBCR_ENC_601; } else { c.colorspace = V4L2_COLORSPACE_REC709; c.ycbcr_enc = V4L2_YCBCR_ENC_709; } c.xfer_func = V4L2_XFER_FUNC_709; break; case HDMI_COLORIMETRY_ITU_601: c.colorspace = V4L2_COLORSPACE_SMPTE170M; c.ycbcr_enc = V4L2_YCBCR_ENC_601; c.xfer_func = V4L2_XFER_FUNC_709; break; case HDMI_COLORIMETRY_ITU_709: c.colorspace = V4L2_COLORSPACE_REC709; c.ycbcr_enc = V4L2_YCBCR_ENC_709; c.xfer_func = V4L2_XFER_FUNC_709; break; case HDMI_COLORIMETRY_EXTENDED: switch (avi->extended_colorimetry) { case HDMI_EXTENDED_COLORIMETRY_XV_YCC_601: c.colorspace = V4L2_COLORSPACE_REC709; c.ycbcr_enc = V4L2_YCBCR_ENC_XV709; c.xfer_func = V4L2_XFER_FUNC_709; break; case HDMI_EXTENDED_COLORIMETRY_XV_YCC_709: c.colorspace = V4L2_COLORSPACE_REC709; c.ycbcr_enc = V4L2_YCBCR_ENC_XV601; c.xfer_func = V4L2_XFER_FUNC_709; break; case HDMI_EXTENDED_COLORIMETRY_S_YCC_601: c.colorspace = V4L2_COLORSPACE_SRGB; c.ycbcr_enc = V4L2_YCBCR_ENC_601; c.xfer_func = V4L2_XFER_FUNC_SRGB; break; case HDMI_EXTENDED_COLORIMETRY_OPYCC_601: c.colorspace = V4L2_COLORSPACE_OPRGB; c.ycbcr_enc = V4L2_YCBCR_ENC_601; c.xfer_func = V4L2_XFER_FUNC_OPRGB; break; case HDMI_EXTENDED_COLORIMETRY_BT2020: c.colorspace = V4L2_COLORSPACE_BT2020; c.ycbcr_enc = V4L2_YCBCR_ENC_BT2020; c.xfer_func = V4L2_XFER_FUNC_709; break; case HDMI_EXTENDED_COLORIMETRY_BT2020_CONST_LUM: c.colorspace = V4L2_COLORSPACE_BT2020; c.ycbcr_enc = V4L2_YCBCR_ENC_BT2020_CONST_LUM; c.xfer_func = V4L2_XFER_FUNC_709; break; default: /* fall back to ITU_709 */ c.colorspace = V4L2_COLORSPACE_REC709; c.ycbcr_enc = V4L2_YCBCR_ENC_709; c.xfer_func = V4L2_XFER_FUNC_709; break; } break; default: break; } /* * YCC Quantization Range signaling is more-or-less broken, * let's just ignore this. */ break; } return c; } EXPORT_SYMBOL_GPL(v4l2_hdmi_rx_colorimetry); /** * v4l2_get_edid_phys_addr() - find and return the physical address * * @edid: pointer to the EDID data * @size: size in bytes of the EDID data * @offset: If not %NULL then the location of the physical address * bytes in the EDID will be returned here. This is set to 0 * if there is no physical address found. * * Return: the physical address or CEC_PHYS_ADDR_INVALID if there is none. */ u16 v4l2_get_edid_phys_addr(const u8 *edid, unsigned int size, unsigned int *offset) { unsigned int loc = cec_get_edid_spa_location(edid, size); if (offset) *offset = loc; if (loc == 0) return CEC_PHYS_ADDR_INVALID; return (edid[loc] << 8) | edid[loc + 1]; } EXPORT_SYMBOL_GPL(v4l2_get_edid_phys_addr); /** * v4l2_set_edid_phys_addr() - find and set the physical address * * @edid: pointer to the EDID data * @size: size in bytes of the EDID data * @phys_addr: the new physical address * * This function finds the location of the physical address in the EDID * and fills in the given physical address and updates the checksum * at the end of the EDID block. It does nothing if the EDID doesn't * contain a physical address. */ void v4l2_set_edid_phys_addr(u8 *edid, unsigned int size, u16 phys_addr) { unsigned int loc = cec_get_edid_spa_location(edid, size); u8 sum = 0; unsigned int i; if (loc == 0) return; edid[loc] = phys_addr >> 8; edid[loc + 1] = phys_addr & 0xff; loc &= ~0x7f; /* update the checksum */ for (i = loc; i < loc + 127; i++) sum += edid[i]; edid[i] = 256 - sum; } EXPORT_SYMBOL_GPL(v4l2_set_edid_phys_addr); /** * v4l2_phys_addr_for_input() - calculate the PA for an input * * @phys_addr: the physical address of the parent * @input: the number of the input port, must be between 1 and 15 * * This function calculates a new physical address based on the input * port number. For example: * * PA = 0.0.0.0 and input = 2 becomes 2.0.0.0 * * PA = 3.0.0.0 and input = 1 becomes 3.1.0.0 * * PA = 3.2.1.0 and input = 5 becomes 3.2.1.5 * * PA = 3.2.1.3 and input = 5 becomes f.f.f.f since it maxed out the depth. * * Return: the new physical address or CEC_PHYS_ADDR_INVALID. */ u16 v4l2_phys_addr_for_input(u16 phys_addr, u8 input) { /* Check if input is sane */ if (WARN_ON(input == 0 || input > 0xf)) return CEC_PHYS_ADDR_INVALID; if (phys_addr == 0) return input << 12; if ((phys_addr & 0x0fff) == 0) return phys_addr | (input << 8); if ((phys_addr & 0x00ff) == 0) return phys_addr | (input << 4); if ((phys_addr & 0x000f) == 0) return phys_addr | input; /* * All nibbles are used so no valid physical addresses can be assigned * to the input. */ return CEC_PHYS_ADDR_INVALID; } EXPORT_SYMBOL_GPL(v4l2_phys_addr_for_input); /** * v4l2_phys_addr_validate() - validate a physical address from an EDID * * @phys_addr: the physical address to validate * @parent: if not %NULL, then this is filled with the parents PA. * @port: if not %NULL, then this is filled with the input port. * * This validates a physical address as read from an EDID. If the * PA is invalid (such as 1.0.1.0 since '0' is only allowed at the end), * then it will return -EINVAL. * * The parent PA is passed into %parent and the input port is passed into * %port. For example: * * PA = 0.0.0.0: has parent 0.0.0.0 and input port 0. * * PA = 1.0.0.0: has parent 0.0.0.0 and input port 1. * * PA = 3.2.0.0: has parent 3.0.0.0 and input port 2. * * PA = f.f.f.f: has parent f.f.f.f and input port 0. * * Return: 0 if the PA is valid, -EINVAL if not. */ int v4l2_phys_addr_validate(u16 phys_addr, u16 *parent, u16 *port) { int i; if (parent) *parent = phys_addr; if (port) *port = 0; if (phys_addr == CEC_PHYS_ADDR_INVALID) return 0; for (i = 0; i < 16; i += 4) if (phys_addr & (0xf << i)) break; if (i == 16) return 0; if (parent) *parent = phys_addr & (0xfff0 << i); if (port) *port = (phys_addr >> i) & 0xf; for (i += 4; i < 16; i += 4) if ((phys_addr & (0xf << i)) == 0) return -EINVAL; return 0; } EXPORT_SYMBOL_GPL(v4l2_phys_addr_validate);
19 11 19 15 17 11 17 4 3 40 32 30 1 40 8 20 24 28 29 29 30 30 30 30 30 6 30 30 18 14 16 29 30 16 1 1 156 152 134 55 149 134 2 153 132 155 54 155 124 150 128 121 44 129 129 128 129 129 1 129 123 126 125 126 126 125 121 125 125 127 54 123 72 123 125 36 37 37 37 37 37 37 37 121 121 121 121 120 121 120 120 121 120 4 122 138 27 1 26 3 27 4 5 4 4 14 5 6 6 35 7 39 17 41 41 1 44 3 2 39 38 41 41 4 38 26 19 13 21 39 31 31 4 9 26 21 14 16 37 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 // SPDX-License-Identifier: GPL-2.0-or-later /* * Digital Audio (PCM) abstract layer * Copyright (c) by Jaroslav Kysela <perex@perex.cz> * Abramo Bagnara <abramo@alsa-project.org> */ #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/time.h> #include <linux/math64.h> #include <linux/export.h> #include <sound/core.h> #include <sound/control.h> #include <sound/tlv.h> #include <sound/info.h> #include <sound/pcm.h> #include <sound/pcm_params.h> #include <sound/timer.h> #include "pcm_local.h" #ifdef CONFIG_SND_PCM_XRUN_DEBUG #define CREATE_TRACE_POINTS #include "pcm_trace.h" #else #define trace_hwptr(substream, pos, in_interrupt) #define trace_xrun(substream) #define trace_hw_ptr_error(substream, reason) #define trace_applptr(substream, prev, curr) #endif static int fill_silence_frames(struct snd_pcm_substream *substream, snd_pcm_uframes_t off, snd_pcm_uframes_t frames); static inline void update_silence_vars(struct snd_pcm_runtime *runtime, snd_pcm_uframes_t ptr, snd_pcm_uframes_t new_ptr) { snd_pcm_sframes_t delta; delta = new_ptr - ptr; if (delta == 0) return; if (delta < 0) delta += runtime->boundary; if ((snd_pcm_uframes_t)delta < runtime->silence_filled) runtime->silence_filled -= delta; else runtime->silence_filled = 0; runtime->silence_start = new_ptr; } /* * fill ring buffer with silence * runtime->silence_start: starting pointer to silence area * runtime->silence_filled: size filled with silence * runtime->silence_threshold: threshold from application * runtime->silence_size: maximal size from application * * when runtime->silence_size >= runtime->boundary - fill processed area with silence immediately */ void snd_pcm_playback_silence(struct snd_pcm_substream *substream, snd_pcm_uframes_t new_hw_ptr) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_uframes_t frames, ofs, transfer; int err; if (runtime->silence_size < runtime->boundary) { snd_pcm_sframes_t noise_dist; snd_pcm_uframes_t appl_ptr = READ_ONCE(runtime->control->appl_ptr); update_silence_vars(runtime, runtime->silence_start, appl_ptr); /* initialization outside pointer updates */ if (new_hw_ptr == ULONG_MAX) new_hw_ptr = runtime->status->hw_ptr; /* get hw_avail with the boundary crossing */ noise_dist = appl_ptr - new_hw_ptr; if (noise_dist < 0) noise_dist += runtime->boundary; /* total noise distance */ noise_dist += runtime->silence_filled; if (noise_dist >= (snd_pcm_sframes_t) runtime->silence_threshold) return; frames = runtime->silence_threshold - noise_dist; if (frames > runtime->silence_size) frames = runtime->silence_size; } else { /* * This filling mode aims at free-running mode (used for example by dmix), * which doesn't update the application pointer. */ snd_pcm_uframes_t hw_ptr = runtime->status->hw_ptr; if (new_hw_ptr == ULONG_MAX) { /* * Initialization, fill the whole unused buffer with silence. * * Usually, this is entered while stopped, before data is queued, * so both pointers are expected to be zero. */ snd_pcm_sframes_t avail = runtime->control->appl_ptr - hw_ptr; if (avail < 0) avail += runtime->boundary; /* * In free-running mode, appl_ptr will be zero even while running, * so we end up with a huge number. There is no useful way to * handle this, so we just clear the whole buffer. */ runtime->silence_filled = avail > runtime->buffer_size ? 0 : avail; runtime->silence_start = hw_ptr; } else { /* Silence the just played area immediately */ update_silence_vars(runtime, hw_ptr, new_hw_ptr); } /* * In this mode, silence_filled actually includes the valid * sample data from the user. */ frames = runtime->buffer_size - runtime->silence_filled; } if (snd_BUG_ON(frames > runtime->buffer_size)) return; if (frames == 0) return; ofs = (runtime->silence_start + runtime->silence_filled) % runtime->buffer_size; do { transfer = ofs + frames > runtime->buffer_size ? runtime->buffer_size - ofs : frames; err = fill_silence_frames(substream, ofs, transfer); snd_BUG_ON(err < 0); runtime->silence_filled += transfer; frames -= transfer; ofs = 0; } while (frames > 0); snd_pcm_dma_buffer_sync(substream, SNDRV_DMA_SYNC_DEVICE); } #ifdef CONFIG_SND_DEBUG void snd_pcm_debug_name(struct snd_pcm_substream *substream, char *name, size_t len) { snprintf(name, len, "pcmC%dD%d%c:%d", substream->pcm->card->number, substream->pcm->device, substream->stream ? 'c' : 'p', substream->number); } EXPORT_SYMBOL(snd_pcm_debug_name); #endif #define XRUN_DEBUG_BASIC (1<<0) #define XRUN_DEBUG_STACK (1<<1) /* dump also stack */ #define XRUN_DEBUG_JIFFIESCHECK (1<<2) /* do jiffies check */ #ifdef CONFIG_SND_PCM_XRUN_DEBUG #define xrun_debug(substream, mask) \ ((substream)->pstr->xrun_debug & (mask)) #else #define xrun_debug(substream, mask) 0 #endif #define dump_stack_on_xrun(substream) do { \ if (xrun_debug(substream, XRUN_DEBUG_STACK)) \ dump_stack(); \ } while (0) /* call with stream lock held */ void __snd_pcm_xrun(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; trace_xrun(substream); if (runtime->tstamp_mode == SNDRV_PCM_TSTAMP_ENABLE) { struct timespec64 tstamp; snd_pcm_gettime(runtime, &tstamp); runtime->status->tstamp.tv_sec = tstamp.tv_sec; runtime->status->tstamp.tv_nsec = tstamp.tv_nsec; } snd_pcm_stop(substream, SNDRV_PCM_STATE_XRUN); if (xrun_debug(substream, XRUN_DEBUG_BASIC)) { char name[16]; snd_pcm_debug_name(substream, name, sizeof(name)); pcm_warn(substream->pcm, "XRUN: %s\n", name); dump_stack_on_xrun(substream); } } #ifdef CONFIG_SND_PCM_XRUN_DEBUG #define hw_ptr_error(substream, in_interrupt, reason, fmt, args...) \ do { \ trace_hw_ptr_error(substream, reason); \ if (xrun_debug(substream, XRUN_DEBUG_BASIC)) { \ pr_err_ratelimited("ALSA: PCM: [%c] " reason ": " fmt, \ (in_interrupt) ? 'Q' : 'P', ##args); \ dump_stack_on_xrun(substream); \ } \ } while (0) #else /* ! CONFIG_SND_PCM_XRUN_DEBUG */ #define hw_ptr_error(substream, fmt, args...) do { } while (0) #endif int snd_pcm_update_state(struct snd_pcm_substream *substream, struct snd_pcm_runtime *runtime) { snd_pcm_uframes_t avail; avail = snd_pcm_avail(substream); if (avail > runtime->avail_max) runtime->avail_max = avail; if (runtime->state == SNDRV_PCM_STATE_DRAINING) { if (avail >= runtime->buffer_size) { snd_pcm_drain_done(substream); return -EPIPE; } } else { if (avail >= runtime->stop_threshold) { __snd_pcm_xrun(substream); return -EPIPE; } } if (runtime->twake) { if (avail >= runtime->twake) wake_up(&runtime->tsleep); } else if (avail >= runtime->control->avail_min) wake_up(&runtime->sleep); return 0; } static void update_audio_tstamp(struct snd_pcm_substream *substream, struct timespec64 *curr_tstamp, struct timespec64 *audio_tstamp) { struct snd_pcm_runtime *runtime = substream->runtime; u64 audio_frames, audio_nsecs; struct timespec64 driver_tstamp; if (runtime->tstamp_mode != SNDRV_PCM_TSTAMP_ENABLE) return; if (!(substream->ops->get_time_info) || (runtime->audio_tstamp_report.actual_type == SNDRV_PCM_AUDIO_TSTAMP_TYPE_DEFAULT)) { /* * provide audio timestamp derived from pointer position * add delay only if requested */ audio_frames = runtime->hw_ptr_wrap + runtime->status->hw_ptr; if (runtime->audio_tstamp_config.report_delay) { if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) audio_frames -= runtime->delay; else audio_frames += runtime->delay; } audio_nsecs = div_u64(audio_frames * 1000000000LL, runtime->rate); *audio_tstamp = ns_to_timespec64(audio_nsecs); } if (runtime->status->audio_tstamp.tv_sec != audio_tstamp->tv_sec || runtime->status->audio_tstamp.tv_nsec != audio_tstamp->tv_nsec) { runtime->status->audio_tstamp.tv_sec = audio_tstamp->tv_sec; runtime->status->audio_tstamp.tv_nsec = audio_tstamp->tv_nsec; runtime->status->tstamp.tv_sec = curr_tstamp->tv_sec; runtime->status->tstamp.tv_nsec = curr_tstamp->tv_nsec; } /* * re-take a driver timestamp to let apps detect if the reference tstamp * read by low-level hardware was provided with a delay */ snd_pcm_gettime(substream->runtime, &driver_tstamp); runtime->driver_tstamp = driver_tstamp; } static int snd_pcm_update_hw_ptr0(struct snd_pcm_substream *substream, unsigned int in_interrupt) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_uframes_t pos; snd_pcm_uframes_t old_hw_ptr, new_hw_ptr, hw_base; snd_pcm_sframes_t hdelta, delta; unsigned long jdelta; unsigned long curr_jiffies; struct timespec64 curr_tstamp; struct timespec64 audio_tstamp; int crossed_boundary = 0; old_hw_ptr = runtime->status->hw_ptr; /* * group pointer, time and jiffies reads to allow for more * accurate correlations/corrections. * The values are stored at the end of this routine after * corrections for hw_ptr position */ pos = substream->ops->pointer(substream); curr_jiffies = jiffies; if (runtime->tstamp_mode == SNDRV_PCM_TSTAMP_ENABLE) { if ((substream->ops->get_time_info) && (runtime->audio_tstamp_config.type_requested != SNDRV_PCM_AUDIO_TSTAMP_TYPE_DEFAULT)) { substream->ops->get_time_info(substream, &curr_tstamp, &audio_tstamp, &runtime->audio_tstamp_config, &runtime->audio_tstamp_report); /* re-test in case tstamp type is not supported in hardware and was demoted to DEFAULT */ if (runtime->audio_tstamp_report.actual_type == SNDRV_PCM_AUDIO_TSTAMP_TYPE_DEFAULT) snd_pcm_gettime(runtime, &curr_tstamp); } else snd_pcm_gettime(runtime, &curr_tstamp); } if (pos == SNDRV_PCM_POS_XRUN) { __snd_pcm_xrun(substream); return -EPIPE; } if (pos >= runtime->buffer_size) { if (printk_ratelimit()) { char name[16]; snd_pcm_debug_name(substream, name, sizeof(name)); pcm_err(substream->pcm, "invalid position: %s, pos = %ld, buffer size = %ld, period size = %ld\n", name, pos, runtime->buffer_size, runtime->period_size); } pos = 0; } pos -= pos % runtime->min_align; trace_hwptr(substream, pos, in_interrupt); hw_base = runtime->hw_ptr_base; new_hw_ptr = hw_base + pos; if (in_interrupt) { /* we know that one period was processed */ /* delta = "expected next hw_ptr" for in_interrupt != 0 */ delta = runtime->hw_ptr_interrupt + runtime->period_size; if (delta > new_hw_ptr) { /* check for double acknowledged interrupts */ hdelta = curr_jiffies - runtime->hw_ptr_jiffies; if (hdelta > runtime->hw_ptr_buffer_jiffies/2 + 1) { hw_base += runtime->buffer_size; if (hw_base >= runtime->boundary) { hw_base = 0; crossed_boundary++; } new_hw_ptr = hw_base + pos; goto __delta; } } } /* new_hw_ptr might be lower than old_hw_ptr in case when */ /* pointer crosses the end of the ring buffer */ if (new_hw_ptr < old_hw_ptr) { hw_base += runtime->buffer_size; if (hw_base >= runtime->boundary) { hw_base = 0; crossed_boundary++; } new_hw_ptr = hw_base + pos; } __delta: delta = new_hw_ptr - old_hw_ptr; if (delta < 0) delta += runtime->boundary; if (runtime->no_period_wakeup) { snd_pcm_sframes_t xrun_threshold; /* * Without regular period interrupts, we have to check * the elapsed time to detect xruns. */ jdelta = curr_jiffies - runtime->hw_ptr_jiffies; if (jdelta < runtime->hw_ptr_buffer_jiffies / 2) goto no_delta_check; hdelta = jdelta - delta * HZ / runtime->rate; xrun_threshold = runtime->hw_ptr_buffer_jiffies / 2 + 1; while (hdelta > xrun_threshold) { delta += runtime->buffer_size; hw_base += runtime->buffer_size; if (hw_base >= runtime->boundary) { hw_base = 0; crossed_boundary++; } new_hw_ptr = hw_base + pos; hdelta -= runtime->hw_ptr_buffer_jiffies; } goto no_delta_check; } /* something must be really wrong */ if (delta >= runtime->buffer_size + runtime->period_size) { hw_ptr_error(substream, in_interrupt, "Unexpected hw_ptr", "(stream=%i, pos=%ld, new_hw_ptr=%ld, old_hw_ptr=%ld)\n", substream->stream, (long)pos, (long)new_hw_ptr, (long)old_hw_ptr); return 0; } /* Do jiffies check only in xrun_debug mode */ if (!xrun_debug(substream, XRUN_DEBUG_JIFFIESCHECK)) goto no_jiffies_check; /* Skip the jiffies check for hardwares with BATCH flag. * Such hardware usually just increases the position at each IRQ, * thus it can't give any strange position. */ if (runtime->hw.info & SNDRV_PCM_INFO_BATCH) goto no_jiffies_check; hdelta = delta; if (hdelta < runtime->delay) goto no_jiffies_check; hdelta -= runtime->delay; jdelta = curr_jiffies - runtime->hw_ptr_jiffies; if (((hdelta * HZ) / runtime->rate) > jdelta + HZ/100) { delta = jdelta / (((runtime->period_size * HZ) / runtime->rate) + HZ/100); /* move new_hw_ptr according jiffies not pos variable */ new_hw_ptr = old_hw_ptr; hw_base = delta; /* use loop to avoid checks for delta overflows */ /* the delta value is small or zero in most cases */ while (delta > 0) { new_hw_ptr += runtime->period_size; if (new_hw_ptr >= runtime->boundary) { new_hw_ptr -= runtime->boundary; crossed_boundary--; } delta--; } /* align hw_base to buffer_size */ hw_ptr_error(substream, in_interrupt, "hw_ptr skipping", "(pos=%ld, delta=%ld, period=%ld, jdelta=%lu/%lu/%lu, hw_ptr=%ld/%ld)\n", (long)pos, (long)hdelta, (long)runtime->period_size, jdelta, ((hdelta * HZ) / runtime->rate), hw_base, (unsigned long)old_hw_ptr, (unsigned long)new_hw_ptr); /* reset values to proper state */ delta = 0; hw_base = new_hw_ptr - (new_hw_ptr % runtime->buffer_size); } no_jiffies_check: if (delta > runtime->period_size + runtime->period_size / 2) { hw_ptr_error(substream, in_interrupt, "Lost interrupts?", "(stream=%i, delta=%ld, new_hw_ptr=%ld, old_hw_ptr=%ld)\n", substream->stream, (long)delta, (long)new_hw_ptr, (long)old_hw_ptr); } no_delta_check: if (runtime->status->hw_ptr == new_hw_ptr) { runtime->hw_ptr_jiffies = curr_jiffies; update_audio_tstamp(substream, &curr_tstamp, &audio_tstamp); return 0; } if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK && runtime->silence_size > 0) snd_pcm_playback_silence(substream, new_hw_ptr); if (in_interrupt) { delta = new_hw_ptr - runtime->hw_ptr_interrupt; if (delta < 0) delta += runtime->boundary; delta -= (snd_pcm_uframes_t)delta % runtime->period_size; runtime->hw_ptr_interrupt += delta; if (runtime->hw_ptr_interrupt >= runtime->boundary) runtime->hw_ptr_interrupt -= runtime->boundary; } runtime->hw_ptr_base = hw_base; runtime->status->hw_ptr = new_hw_ptr; runtime->hw_ptr_jiffies = curr_jiffies; if (crossed_boundary) { snd_BUG_ON(crossed_boundary != 1); runtime->hw_ptr_wrap += runtime->boundary; } update_audio_tstamp(substream, &curr_tstamp, &audio_tstamp); return snd_pcm_update_state(substream, runtime); } /* CAUTION: call it with irq disabled */ int snd_pcm_update_hw_ptr(struct snd_pcm_substream *substream) { return snd_pcm_update_hw_ptr0(substream, 0); } /** * snd_pcm_set_ops - set the PCM operators * @pcm: the pcm instance * @direction: stream direction, SNDRV_PCM_STREAM_XXX * @ops: the operator table * * Sets the given PCM operators to the pcm instance. */ void snd_pcm_set_ops(struct snd_pcm *pcm, int direction, const struct snd_pcm_ops *ops) { struct snd_pcm_str *stream = &pcm->streams[direction]; struct snd_pcm_substream *substream; for (substream = stream->substream; substream != NULL; substream = substream->next) substream->ops = ops; } EXPORT_SYMBOL(snd_pcm_set_ops); /** * snd_pcm_set_sync - set the PCM sync id * @substream: the pcm substream * * Sets the PCM sync identifier for the card. */ void snd_pcm_set_sync(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; runtime->sync.id32[0] = substream->pcm->card->number; runtime->sync.id32[1] = -1; runtime->sync.id32[2] = -1; runtime->sync.id32[3] = -1; } EXPORT_SYMBOL(snd_pcm_set_sync); /* * Standard ioctl routine */ static inline unsigned int div32(unsigned int a, unsigned int b, unsigned int *r) { if (b == 0) { *r = 0; return UINT_MAX; } *r = a % b; return a / b; } static inline unsigned int div_down(unsigned int a, unsigned int b) { if (b == 0) return UINT_MAX; return a / b; } static inline unsigned int div_up(unsigned int a, unsigned int b) { unsigned int r; unsigned int q; if (b == 0) return UINT_MAX; q = div32(a, b, &r); if (r) ++q; return q; } static inline unsigned int mul(unsigned int a, unsigned int b) { if (a == 0) return 0; if (div_down(UINT_MAX, a) < b) return UINT_MAX; return a * b; } static inline unsigned int muldiv32(unsigned int a, unsigned int b, unsigned int c, unsigned int *r) { u_int64_t n = (u_int64_t) a * b; if (c == 0) { *r = 0; return UINT_MAX; } n = div_u64_rem(n, c, r); if (n >= UINT_MAX) { *r = 0; return UINT_MAX; } return n; } /** * snd_interval_refine - refine the interval value of configurator * @i: the interval value to refine * @v: the interval value to refer to * * Refines the interval value with the reference value. * The interval is changed to the range satisfying both intervals. * The interval status (min, max, integer, etc.) are evaluated. * * Return: Positive if the value is changed, zero if it's not changed, or a * negative error code. */ int snd_interval_refine(struct snd_interval *i, const struct snd_interval *v) { int changed = 0; if (snd_BUG_ON(snd_interval_empty(i))) return -EINVAL; if (i->min < v->min) { i->min = v->min; i->openmin = v->openmin; changed = 1; } else if (i->min == v->min && !i->openmin && v->openmin) { i->openmin = 1; changed = 1; } if (i->max > v->max) { i->max = v->max; i->openmax = v->openmax; changed = 1; } else if (i->max == v->max && !i->openmax && v->openmax) { i->openmax = 1; changed = 1; } if (!i->integer && v->integer) { i->integer = 1; changed = 1; } if (i->integer) { if (i->openmin) { i->min++; i->openmin = 0; } if (i->openmax) { i->max--; i->openmax = 0; } } else if (!i->openmin && !i->openmax && i->min == i->max) i->integer = 1; if (snd_interval_checkempty(i)) { snd_interval_none(i); return -EINVAL; } return changed; } EXPORT_SYMBOL(snd_interval_refine); static int snd_interval_refine_first(struct snd_interval *i) { const unsigned int last_max = i->max; if (snd_BUG_ON(snd_interval_empty(i))) return -EINVAL; if (snd_interval_single(i)) return 0; i->max = i->min; if (i->openmin) i->max++; /* only exclude max value if also excluded before refine */ i->openmax = (i->openmax && i->max >= last_max); return 1; } static int snd_interval_refine_last(struct snd_interval *i) { const unsigned int last_min = i->min; if (snd_BUG_ON(snd_interval_empty(i))) return -EINVAL; if (snd_interval_single(i)) return 0; i->min = i->max; if (i->openmax) i->min--; /* only exclude min value if also excluded before refine */ i->openmin = (i->openmin && i->min <= last_min); return 1; } void snd_interval_mul(const struct snd_interval *a, const struct snd_interval *b, struct snd_interval *c) { if (a->empty || b->empty) { snd_interval_none(c); return; } c->empty = 0; c->min = mul(a->min, b->min); c->openmin = (a->openmin || b->openmin); c->max = mul(a->max, b->max); c->openmax = (a->openmax || b->openmax); c->integer = (a->integer && b->integer); } /** * snd_interval_div - refine the interval value with division * @a: dividend * @b: divisor * @c: quotient * * c = a / b * * Returns non-zero if the value is changed, zero if not changed. */ void snd_interval_div(const struct snd_interval *a, const struct snd_interval *b, struct snd_interval *c) { unsigned int r; if (a->empty || b->empty) { snd_interval_none(c); return; } c->empty = 0; c->min = div32(a->min, b->max, &r); c->openmin = (r || a->openmin || b->openmax); if (b->min > 0) { c->max = div32(a->max, b->min, &r); if (r) { c->max++; c->openmax = 1; } else c->openmax = (a->openmax || b->openmin); } else { c->max = UINT_MAX; c->openmax = 0; } c->integer = 0; } /** * snd_interval_muldivk - refine the interval value * @a: dividend 1 * @b: dividend 2 * @k: divisor (as integer) * @c: result * * c = a * b / k * * Returns non-zero if the value is changed, zero if not changed. */ void snd_interval_muldivk(const struct snd_interval *a, const struct snd_interval *b, unsigned int k, struct snd_interval *c) { unsigned int r; if (a->empty || b->empty) { snd_interval_none(c); return; } c->empty = 0; c->min = muldiv32(a->min, b->min, k, &r); c->openmin = (r || a->openmin || b->openmin); c->max = muldiv32(a->max, b->max, k, &r); if (r) { c->max++; c->openmax = 1; } else c->openmax = (a->openmax || b->openmax); c->integer = 0; } /** * snd_interval_mulkdiv - refine the interval value * @a: dividend 1 * @k: dividend 2 (as integer) * @b: divisor * @c: result * * c = a * k / b * * Returns non-zero if the value is changed, zero if not changed. */ void snd_interval_mulkdiv(const struct snd_interval *a, unsigned int k, const struct snd_interval *b, struct snd_interval *c) { unsigned int r; if (a->empty || b->empty) { snd_interval_none(c); return; } c->empty = 0; c->min = muldiv32(a->min, k, b->max, &r); c->openmin = (r || a->openmin || b->openmax); if (b->min > 0) { c->max = muldiv32(a->max, k, b->min, &r); if (r) { c->max++; c->openmax = 1; } else c->openmax = (a->openmax || b->openmin); } else { c->max = UINT_MAX; c->openmax = 0; } c->integer = 0; } /* ---- */ /** * snd_interval_ratnum - refine the interval value * @i: interval to refine * @rats_count: number of ratnum_t * @rats: ratnum_t array * @nump: pointer to store the resultant numerator * @denp: pointer to store the resultant denominator * * Return: Positive if the value is changed, zero if it's not changed, or a * negative error code. */ int snd_interval_ratnum(struct snd_interval *i, unsigned int rats_count, const struct snd_ratnum *rats, unsigned int *nump, unsigned int *denp) { unsigned int best_num, best_den; int best_diff; unsigned int k; struct snd_interval t; int err; unsigned int result_num, result_den; int result_diff; best_num = best_den = best_diff = 0; for (k = 0; k < rats_count; ++k) { unsigned int num = rats[k].num; unsigned int den; unsigned int q = i->min; int diff; if (q == 0) q = 1; den = div_up(num, q); if (den < rats[k].den_min) continue; if (den > rats[k].den_max) den = rats[k].den_max; else { unsigned int r; r = (den - rats[k].den_min) % rats[k].den_step; if (r != 0) den -= r; } diff = num - q * den; if (diff < 0) diff = -diff; if (best_num == 0 || diff * best_den < best_diff * den) { best_diff = diff; best_den = den; best_num = num; } } if (best_den == 0) { i->empty = 1; return -EINVAL; } t.min = div_down(best_num, best_den); t.openmin = !!(best_num % best_den); result_num = best_num; result_diff = best_diff; result_den = best_den; best_num = best_den = best_diff = 0; for (k = 0; k < rats_count; ++k) { unsigned int num = rats[k].num; unsigned int den; unsigned int q = i->max; int diff; if (q == 0) { i->empty = 1; return -EINVAL; } den = div_down(num, q); if (den > rats[k].den_max) continue; if (den < rats[k].den_min) den = rats[k].den_min; else { unsigned int r; r = (den - rats[k].den_min) % rats[k].den_step; if (r != 0) den += rats[k].den_step - r; } diff = q * den - num; if (diff < 0) diff = -diff; if (best_num == 0 || diff * best_den < best_diff * den) { best_diff = diff; best_den = den; best_num = num; } } if (best_den == 0) { i->empty = 1; return -EINVAL; } t.max = div_up(best_num, best_den); t.openmax = !!(best_num % best_den); t.integer = 0; err = snd_interval_refine(i, &t); if (err < 0) return err; if (snd_interval_single(i)) { if (best_diff * result_den < result_diff * best_den) { result_num = best_num; result_den = best_den; } if (nump) *nump = result_num; if (denp) *denp = result_den; } return err; } EXPORT_SYMBOL(snd_interval_ratnum); /** * snd_interval_ratden - refine the interval value * @i: interval to refine * @rats_count: number of struct ratden * @rats: struct ratden array * @nump: pointer to store the resultant numerator * @denp: pointer to store the resultant denominator * * Return: Positive if the value is changed, zero if it's not changed, or a * negative error code. */ static int snd_interval_ratden(struct snd_interval *i, unsigned int rats_count, const struct snd_ratden *rats, unsigned int *nump, unsigned int *denp) { unsigned int best_num, best_diff, best_den; unsigned int k; struct snd_interval t; int err; best_num = best_den = best_diff = 0; for (k = 0; k < rats_count; ++k) { unsigned int num; unsigned int den = rats[k].den; unsigned int q = i->min; int diff; num = mul(q, den); if (num > rats[k].num_max) continue; if (num < rats[k].num_min) num = rats[k].num_max; else { unsigned int r; r = (num - rats[k].num_min) % rats[k].num_step; if (r != 0) num += rats[k].num_step - r; } diff = num - q * den; if (best_num == 0 || diff * best_den < best_diff * den) { best_diff = diff; best_den = den; best_num = num; } } if (best_den == 0) { i->empty = 1; return -EINVAL; } t.min = div_down(best_num, best_den); t.openmin = !!(best_num % best_den); best_num = best_den = best_diff = 0; for (k = 0; k < rats_count; ++k) { unsigned int num; unsigned int den = rats[k].den; unsigned int q = i->max; int diff; num = mul(q, den); if (num < rats[k].num_min) continue; if (num > rats[k].num_max) num = rats[k].num_max; else { unsigned int r; r = (num - rats[k].num_min) % rats[k].num_step; if (r != 0) num -= r; } diff = q * den - num; if (best_num == 0 || diff * best_den < best_diff * den) { best_diff = diff; best_den = den; best_num = num; } } if (best_den == 0) { i->empty = 1; return -EINVAL; } t.max = div_up(best_num, best_den); t.openmax = !!(best_num % best_den); t.integer = 0; err = snd_interval_refine(i, &t); if (err < 0) return err; if (snd_interval_single(i)) { if (nump) *nump = best_num; if (denp) *denp = best_den; } return err; } /** * snd_interval_list - refine the interval value from the list * @i: the interval value to refine * @count: the number of elements in the list * @list: the value list * @mask: the bit-mask to evaluate * * Refines the interval value from the list. * When mask is non-zero, only the elements corresponding to bit 1 are * evaluated. * * Return: Positive if the value is changed, zero if it's not changed, or a * negative error code. */ int snd_interval_list(struct snd_interval *i, unsigned int count, const unsigned int *list, unsigned int mask) { unsigned int k; struct snd_interval list_range; if (!count) { i->empty = 1; return -EINVAL; } snd_interval_any(&list_range); list_range.min = UINT_MAX; list_range.max = 0; for (k = 0; k < count; k++) { if (mask && !(mask & (1 << k))) continue; if (!snd_interval_test(i, list[k])) continue; list_range.min = min(list_range.min, list[k]); list_range.max = max(list_range.max, list[k]); } return snd_interval_refine(i, &list_range); } EXPORT_SYMBOL(snd_interval_list); /** * snd_interval_ranges - refine the interval value from the list of ranges * @i: the interval value to refine * @count: the number of elements in the list of ranges * @ranges: the ranges list * @mask: the bit-mask to evaluate * * Refines the interval value from the list of ranges. * When mask is non-zero, only the elements corresponding to bit 1 are * evaluated. * * Return: Positive if the value is changed, zero if it's not changed, or a * negative error code. */ int snd_interval_ranges(struct snd_interval *i, unsigned int count, const struct snd_interval *ranges, unsigned int mask) { unsigned int k; struct snd_interval range_union; struct snd_interval range; if (!count) { snd_interval_none(i); return -EINVAL; } snd_interval_any(&range_union); range_union.min = UINT_MAX; range_union.max = 0; for (k = 0; k < count; k++) { if (mask && !(mask & (1 << k))) continue; snd_interval_copy(&range, &ranges[k]); if (snd_interval_refine(&range, i) < 0) continue; if (snd_interval_empty(&range)) continue; if (range.min < range_union.min) { range_union.min = range.min; range_union.openmin = 1; } if (range.min == range_union.min && !range.openmin) range_union.openmin = 0; if (range.max > range_union.max) { range_union.max = range.max; range_union.openmax = 1; } if (range.max == range_union.max && !range.openmax) range_union.openmax = 0; } return snd_interval_refine(i, &range_union); } EXPORT_SYMBOL(snd_interval_ranges); static int snd_interval_step(struct snd_interval *i, unsigned int step) { unsigned int n; int changed = 0; n = i->min % step; if (n != 0 || i->openmin) { i->min += step - n; i->openmin = 0; changed = 1; } n = i->max % step; if (n != 0 || i->openmax) { i->max -= n; i->openmax = 0; changed = 1; } if (snd_interval_checkempty(i)) { i->empty = 1; return -EINVAL; } return changed; } /* Info constraints helpers */ /** * snd_pcm_hw_rule_add - add the hw-constraint rule * @runtime: the pcm runtime instance * @cond: condition bits * @var: the variable to evaluate * @func: the evaluation function * @private: the private data pointer passed to function * @dep: the dependent variables * * Return: Zero if successful, or a negative error code on failure. */ int snd_pcm_hw_rule_add(struct snd_pcm_runtime *runtime, unsigned int cond, int var, snd_pcm_hw_rule_func_t func, void *private, int dep, ...) { struct snd_pcm_hw_constraints *constrs = &runtime->hw_constraints; struct snd_pcm_hw_rule *c; unsigned int k; va_list args; va_start(args, dep); if (constrs->rules_num >= constrs->rules_all) { struct snd_pcm_hw_rule *new; unsigned int new_rules = constrs->rules_all + 16; new = krealloc_array(constrs->rules, new_rules, sizeof(*c), GFP_KERNEL); if (!new) { va_end(args); return -ENOMEM; } constrs->rules = new; constrs->rules_all = new_rules; } c = &constrs->rules[constrs->rules_num]; c->cond = cond; c->func = func; c->var = var; c->private = private; k = 0; while (1) { if (snd_BUG_ON(k >= ARRAY_SIZE(c->deps))) { va_end(args); return -EINVAL; } c->deps[k++] = dep; if (dep < 0) break; dep = va_arg(args, int); } constrs->rules_num++; va_end(args); return 0; } EXPORT_SYMBOL(snd_pcm_hw_rule_add); /** * snd_pcm_hw_constraint_mask - apply the given bitmap mask constraint * @runtime: PCM runtime instance * @var: hw_params variable to apply the mask * @mask: the bitmap mask * * Apply the constraint of the given bitmap mask to a 32-bit mask parameter. * * Return: Zero if successful, or a negative error code on failure. */ int snd_pcm_hw_constraint_mask(struct snd_pcm_runtime *runtime, snd_pcm_hw_param_t var, u_int32_t mask) { struct snd_pcm_hw_constraints *constrs = &runtime->hw_constraints; struct snd_mask *maskp = constrs_mask(constrs, var); *maskp->bits &= mask; memset(maskp->bits + 1, 0, (SNDRV_MASK_MAX-32) / 8); /* clear rest */ if (*maskp->bits == 0) return -EINVAL; return 0; } /** * snd_pcm_hw_constraint_mask64 - apply the given bitmap mask constraint * @runtime: PCM runtime instance * @var: hw_params variable to apply the mask * @mask: the 64bit bitmap mask * * Apply the constraint of the given bitmap mask to a 64-bit mask parameter. * * Return: Zero if successful, or a negative error code on failure. */ int snd_pcm_hw_constraint_mask64(struct snd_pcm_runtime *runtime, snd_pcm_hw_param_t var, u_int64_t mask) { struct snd_pcm_hw_constraints *constrs = &runtime->hw_constraints; struct snd_mask *maskp = constrs_mask(constrs, var); maskp->bits[0] &= (u_int32_t)mask; maskp->bits[1] &= (u_int32_t)(mask >> 32); memset(maskp->bits + 2, 0, (SNDRV_MASK_MAX-64) / 8); /* clear rest */ if (! maskp->bits[0] && ! maskp->bits[1]) return -EINVAL; return 0; } EXPORT_SYMBOL(snd_pcm_hw_constraint_mask64); /** * snd_pcm_hw_constraint_integer - apply an integer constraint to an interval * @runtime: PCM runtime instance * @var: hw_params variable to apply the integer constraint * * Apply the constraint of integer to an interval parameter. * * Return: Positive if the value is changed, zero if it's not changed, or a * negative error code. */ int snd_pcm_hw_constraint_integer(struct snd_pcm_runtime *runtime, snd_pcm_hw_param_t var) { struct snd_pcm_hw_constraints *constrs = &runtime->hw_constraints; return snd_interval_setinteger(constrs_interval(constrs, var)); } EXPORT_SYMBOL(snd_pcm_hw_constraint_integer); /** * snd_pcm_hw_constraint_minmax - apply a min/max range constraint to an interval * @runtime: PCM runtime instance * @var: hw_params variable to apply the range * @min: the minimal value * @max: the maximal value * * Apply the min/max range constraint to an interval parameter. * * Return: Positive if the value is changed, zero if it's not changed, or a * negative error code. */ int snd_pcm_hw_constraint_minmax(struct snd_pcm_runtime *runtime, snd_pcm_hw_param_t var, unsigned int min, unsigned int max) { struct snd_pcm_hw_constraints *constrs = &runtime->hw_constraints; struct snd_interval t; t.min = min; t.max = max; t.openmin = t.openmax = 0; t.integer = 0; return snd_interval_refine(constrs_interval(constrs, var), &t); } EXPORT_SYMBOL(snd_pcm_hw_constraint_minmax); static int snd_pcm_hw_rule_list(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct snd_pcm_hw_constraint_list *list = rule->private; return snd_interval_list(hw_param_interval(params, rule->var), list->count, list->list, list->mask); } /** * snd_pcm_hw_constraint_list - apply a list of constraints to a parameter * @runtime: PCM runtime instance * @cond: condition bits * @var: hw_params variable to apply the list constraint * @l: list * * Apply the list of constraints to an interval parameter. * * Return: Zero if successful, or a negative error code on failure. */ int snd_pcm_hw_constraint_list(struct snd_pcm_runtime *runtime, unsigned int cond, snd_pcm_hw_param_t var, const struct snd_pcm_hw_constraint_list *l) { return snd_pcm_hw_rule_add(runtime, cond, var, snd_pcm_hw_rule_list, (void *)l, var, -1); } EXPORT_SYMBOL(snd_pcm_hw_constraint_list); static int snd_pcm_hw_rule_ranges(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct snd_pcm_hw_constraint_ranges *r = rule->private; return snd_interval_ranges(hw_param_interval(params, rule->var), r->count, r->ranges, r->mask); } /** * snd_pcm_hw_constraint_ranges - apply list of range constraints to a parameter * @runtime: PCM runtime instance * @cond: condition bits * @var: hw_params variable to apply the list of range constraints * @r: ranges * * Apply the list of range constraints to an interval parameter. * * Return: Zero if successful, or a negative error code on failure. */ int snd_pcm_hw_constraint_ranges(struct snd_pcm_runtime *runtime, unsigned int cond, snd_pcm_hw_param_t var, const struct snd_pcm_hw_constraint_ranges *r) { return snd_pcm_hw_rule_add(runtime, cond, var, snd_pcm_hw_rule_ranges, (void *)r, var, -1); } EXPORT_SYMBOL(snd_pcm_hw_constraint_ranges); static int snd_pcm_hw_rule_ratnums(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { const struct snd_pcm_hw_constraint_ratnums *r = rule->private; unsigned int num = 0, den = 0; int err; err = snd_interval_ratnum(hw_param_interval(params, rule->var), r->nrats, r->rats, &num, &den); if (err >= 0 && den && rule->var == SNDRV_PCM_HW_PARAM_RATE) { params->rate_num = num; params->rate_den = den; } return err; } /** * snd_pcm_hw_constraint_ratnums - apply ratnums constraint to a parameter * @runtime: PCM runtime instance * @cond: condition bits * @var: hw_params variable to apply the ratnums constraint * @r: struct snd_ratnums constriants * * Return: Zero if successful, or a negative error code on failure. */ int snd_pcm_hw_constraint_ratnums(struct snd_pcm_runtime *runtime, unsigned int cond, snd_pcm_hw_param_t var, const struct snd_pcm_hw_constraint_ratnums *r) { return snd_pcm_hw_rule_add(runtime, cond, var, snd_pcm_hw_rule_ratnums, (void *)r, var, -1); } EXPORT_SYMBOL(snd_pcm_hw_constraint_ratnums); static int snd_pcm_hw_rule_ratdens(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { const struct snd_pcm_hw_constraint_ratdens *r = rule->private; unsigned int num = 0, den = 0; int err = snd_interval_ratden(hw_param_interval(params, rule->var), r->nrats, r->rats, &num, &den); if (err >= 0 && den && rule->var == SNDRV_PCM_HW_PARAM_RATE) { params->rate_num = num; params->rate_den = den; } return err; } /** * snd_pcm_hw_constraint_ratdens - apply ratdens constraint to a parameter * @runtime: PCM runtime instance * @cond: condition bits * @var: hw_params variable to apply the ratdens constraint * @r: struct snd_ratdens constriants * * Return: Zero if successful, or a negative error code on failure. */ int snd_pcm_hw_constraint_ratdens(struct snd_pcm_runtime *runtime, unsigned int cond, snd_pcm_hw_param_t var, const struct snd_pcm_hw_constraint_ratdens *r) { return snd_pcm_hw_rule_add(runtime, cond, var, snd_pcm_hw_rule_ratdens, (void *)r, var, -1); } EXPORT_SYMBOL(snd_pcm_hw_constraint_ratdens); static int snd_pcm_hw_rule_msbits(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { unsigned int l = (unsigned long) rule->private; int width = l & 0xffff; unsigned int msbits = l >> 16; const struct snd_interval *i = hw_param_interval_c(params, SNDRV_PCM_HW_PARAM_SAMPLE_BITS); if (!snd_interval_single(i)) return 0; if ((snd_interval_value(i) == width) || (width == 0 && snd_interval_value(i) > msbits)) params->msbits = min_not_zero(params->msbits, msbits); return 0; } /** * snd_pcm_hw_constraint_msbits - add a hw constraint msbits rule * @runtime: PCM runtime instance * @cond: condition bits * @width: sample bits width * @msbits: msbits width * * This constraint will set the number of most significant bits (msbits) if a * sample format with the specified width has been select. If width is set to 0 * the msbits will be set for any sample format with a width larger than the * specified msbits. * * Return: Zero if successful, or a negative error code on failure. */ int snd_pcm_hw_constraint_msbits(struct snd_pcm_runtime *runtime, unsigned int cond, unsigned int width, unsigned int msbits) { unsigned long l = (msbits << 16) | width; return snd_pcm_hw_rule_add(runtime, cond, -1, snd_pcm_hw_rule_msbits, (void*) l, SNDRV_PCM_HW_PARAM_SAMPLE_BITS, -1); } EXPORT_SYMBOL(snd_pcm_hw_constraint_msbits); static int snd_pcm_hw_rule_step(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { unsigned long step = (unsigned long) rule->private; return snd_interval_step(hw_param_interval(params, rule->var), step); } /** * snd_pcm_hw_constraint_step - add a hw constraint step rule * @runtime: PCM runtime instance * @cond: condition bits * @var: hw_params variable to apply the step constraint * @step: step size * * Return: Zero if successful, or a negative error code on failure. */ int snd_pcm_hw_constraint_step(struct snd_pcm_runtime *runtime, unsigned int cond, snd_pcm_hw_param_t var, unsigned long step) { return snd_pcm_hw_rule_add(runtime, cond, var, snd_pcm_hw_rule_step, (void *) step, var, -1); } EXPORT_SYMBOL(snd_pcm_hw_constraint_step); static int snd_pcm_hw_rule_pow2(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { static const unsigned int pow2_sizes[] = { 1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, 1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, 1<<15, 1<<16, 1<<17, 1<<18, 1<<19, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30 }; return snd_interval_list(hw_param_interval(params, rule->var), ARRAY_SIZE(pow2_sizes), pow2_sizes, 0); } /** * snd_pcm_hw_constraint_pow2 - add a hw constraint power-of-2 rule * @runtime: PCM runtime instance * @cond: condition bits * @var: hw_params variable to apply the power-of-2 constraint * * Return: Zero if successful, or a negative error code on failure. */ int snd_pcm_hw_constraint_pow2(struct snd_pcm_runtime *runtime, unsigned int cond, snd_pcm_hw_param_t var) { return snd_pcm_hw_rule_add(runtime, cond, var, snd_pcm_hw_rule_pow2, NULL, var, -1); } EXPORT_SYMBOL(snd_pcm_hw_constraint_pow2); static int snd_pcm_hw_rule_noresample_func(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { unsigned int base_rate = (unsigned int)(uintptr_t)rule->private; struct snd_interval *rate; rate = hw_param_interval(params, SNDRV_PCM_HW_PARAM_RATE); return snd_interval_list(rate, 1, &base_rate, 0); } /** * snd_pcm_hw_rule_noresample - add a rule to allow disabling hw resampling * @runtime: PCM runtime instance * @base_rate: the rate at which the hardware does not resample * * Return: Zero if successful, or a negative error code on failure. */ int snd_pcm_hw_rule_noresample(struct snd_pcm_runtime *runtime, unsigned int base_rate) { return snd_pcm_hw_rule_add(runtime, SNDRV_PCM_HW_PARAMS_NORESAMPLE, SNDRV_PCM_HW_PARAM_RATE, snd_pcm_hw_rule_noresample_func, (void *)(uintptr_t)base_rate, SNDRV_PCM_HW_PARAM_RATE, -1); } EXPORT_SYMBOL(snd_pcm_hw_rule_noresample); static void _snd_pcm_hw_param_any(struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var) { if (hw_is_mask(var)) { snd_mask_any(hw_param_mask(params, var)); params->cmask |= 1 << var; params->rmask |= 1 << var; return; } if (hw_is_interval(var)) { snd_interval_any(hw_param_interval(params, var)); params->cmask |= 1 << var; params->rmask |= 1 << var; return; } snd_BUG(); } void _snd_pcm_hw_params_any(struct snd_pcm_hw_params *params) { unsigned int k; memset(params, 0, sizeof(*params)); for (k = SNDRV_PCM_HW_PARAM_FIRST_MASK; k <= SNDRV_PCM_HW_PARAM_LAST_MASK; k++) _snd_pcm_hw_param_any(params, k); for (k = SNDRV_PCM_HW_PARAM_FIRST_INTERVAL; k <= SNDRV_PCM_HW_PARAM_LAST_INTERVAL; k++) _snd_pcm_hw_param_any(params, k); params->info = ~0U; } EXPORT_SYMBOL(_snd_pcm_hw_params_any); /** * snd_pcm_hw_param_value - return @params field @var value * @params: the hw_params instance * @var: parameter to retrieve * @dir: pointer to the direction (-1,0,1) or %NULL * * Return: The value for field @var if it's fixed in configuration space * defined by @params. -%EINVAL otherwise. */ int snd_pcm_hw_param_value(const struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var, int *dir) { if (hw_is_mask(var)) { const struct snd_mask *mask = hw_param_mask_c(params, var); if (!snd_mask_single(mask)) return -EINVAL; if (dir) *dir = 0; return snd_mask_value(mask); } if (hw_is_interval(var)) { const struct snd_interval *i = hw_param_interval_c(params, var); if (!snd_interval_single(i)) return -EINVAL; if (dir) *dir = i->openmin; return snd_interval_value(i); } return -EINVAL; } EXPORT_SYMBOL(snd_pcm_hw_param_value); void _snd_pcm_hw_param_setempty(struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var) { if (hw_is_mask(var)) { snd_mask_none(hw_param_mask(params, var)); params->cmask |= 1 << var; params->rmask |= 1 << var; } else if (hw_is_interval(var)) { snd_interval_none(hw_param_interval(params, var)); params->cmask |= 1 << var; params->rmask |= 1 << var; } else { snd_BUG(); } } EXPORT_SYMBOL(_snd_pcm_hw_param_setempty); static int _snd_pcm_hw_param_first(struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var) { int changed; if (hw_is_mask(var)) changed = snd_mask_refine_first(hw_param_mask(params, var)); else if (hw_is_interval(var)) changed = snd_interval_refine_first(hw_param_interval(params, var)); else return -EINVAL; if (changed > 0) { params->cmask |= 1 << var; params->rmask |= 1 << var; } return changed; } /** * snd_pcm_hw_param_first - refine config space and return minimum value * @pcm: PCM instance * @params: the hw_params instance * @var: parameter to retrieve * @dir: pointer to the direction (-1,0,1) or %NULL * * Inside configuration space defined by @params remove from @var all * values > minimum. Reduce configuration space accordingly. * * Return: The minimum, or a negative error code on failure. */ int snd_pcm_hw_param_first(struct snd_pcm_substream *pcm, struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var, int *dir) { int changed = _snd_pcm_hw_param_first(params, var); if (changed < 0) return changed; if (params->rmask) { int err = snd_pcm_hw_refine(pcm, params); if (err < 0) return err; } return snd_pcm_hw_param_value(params, var, dir); } EXPORT_SYMBOL(snd_pcm_hw_param_first); static int _snd_pcm_hw_param_last(struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var) { int changed; if (hw_is_mask(var)) changed = snd_mask_refine_last(hw_param_mask(params, var)); else if (hw_is_interval(var)) changed = snd_interval_refine_last(hw_param_interval(params, var)); else return -EINVAL; if (changed > 0) { params->cmask |= 1 << var; params->rmask |= 1 << var; } return changed; } /** * snd_pcm_hw_param_last - refine config space and return maximum value * @pcm: PCM instance * @params: the hw_params instance * @var: parameter to retrieve * @dir: pointer to the direction (-1,0,1) or %NULL * * Inside configuration space defined by @params remove from @var all * values < maximum. Reduce configuration space accordingly. * * Return: The maximum, or a negative error code on failure. */ int snd_pcm_hw_param_last(struct snd_pcm_substream *pcm, struct snd_pcm_hw_params *params, snd_pcm_hw_param_t var, int *dir) { int changed = _snd_pcm_hw_param_last(params, var); if (changed < 0) return changed; if (params->rmask) { int err = snd_pcm_hw_refine(pcm, params); if (err < 0) return err; } return snd_pcm_hw_param_value(params, var, dir); } EXPORT_SYMBOL(snd_pcm_hw_param_last); static int snd_pcm_lib_ioctl_reset(struct snd_pcm_substream *substream, void *arg) { struct snd_pcm_runtime *runtime = substream->runtime; unsigned long flags; snd_pcm_stream_lock_irqsave(substream, flags); if (snd_pcm_running(substream) && snd_pcm_update_hw_ptr(substream) >= 0) runtime->status->hw_ptr %= runtime->buffer_size; else { runtime->status->hw_ptr = 0; runtime->hw_ptr_wrap = 0; } snd_pcm_stream_unlock_irqrestore(substream, flags); return 0; } static int snd_pcm_lib_ioctl_channel_info(struct snd_pcm_substream *substream, void *arg) { struct snd_pcm_channel_info *info = arg; struct snd_pcm_runtime *runtime = substream->runtime; int width; if (!(runtime->info & SNDRV_PCM_INFO_MMAP)) { info->offset = -1; return 0; } width = snd_pcm_format_physical_width(runtime->format); if (width < 0) return width; info->offset = 0; switch (runtime->access) { case SNDRV_PCM_ACCESS_MMAP_INTERLEAVED: case SNDRV_PCM_ACCESS_RW_INTERLEAVED: info->first = info->channel * width; info->step = runtime->channels * width; break; case SNDRV_PCM_ACCESS_MMAP_NONINTERLEAVED: case SNDRV_PCM_ACCESS_RW_NONINTERLEAVED: { size_t size = runtime->dma_bytes / runtime->channels; info->first = info->channel * size * 8; info->step = width; break; } default: snd_BUG(); break; } return 0; } static int snd_pcm_lib_ioctl_fifo_size(struct snd_pcm_substream *substream, void *arg) { struct snd_pcm_hw_params *params = arg; snd_pcm_format_t format; int channels; ssize_t frame_size; params->fifo_size = substream->runtime->hw.fifo_size; if (!(substream->runtime->hw.info & SNDRV_PCM_INFO_FIFO_IN_FRAMES)) { format = params_format(params); channels = params_channels(params); frame_size = snd_pcm_format_size(format, channels); if (frame_size > 0) params->fifo_size /= frame_size; } return 0; } /** * snd_pcm_lib_ioctl - a generic PCM ioctl callback * @substream: the pcm substream instance * @cmd: ioctl command * @arg: ioctl argument * * Processes the generic ioctl commands for PCM. * Can be passed as the ioctl callback for PCM ops. * * Return: Zero if successful, or a negative error code on failure. */ int snd_pcm_lib_ioctl(struct snd_pcm_substream *substream, unsigned int cmd, void *arg) { switch (cmd) { case SNDRV_PCM_IOCTL1_RESET: return snd_pcm_lib_ioctl_reset(substream, arg); case SNDRV_PCM_IOCTL1_CHANNEL_INFO: return snd_pcm_lib_ioctl_channel_info(substream, arg); case SNDRV_PCM_IOCTL1_FIFO_SIZE: return snd_pcm_lib_ioctl_fifo_size(substream, arg); } return -ENXIO; } EXPORT_SYMBOL(snd_pcm_lib_ioctl); /** * snd_pcm_period_elapsed_under_stream_lock() - update the status of runtime for the next period * under acquired lock of PCM substream. * @substream: the instance of pcm substream. * * This function is called when the batch of audio data frames as the same size as the period of * buffer is already processed in audio data transmission. * * The call of function updates the status of runtime with the latest position of audio data * transmission, checks overrun and underrun over buffer, awaken user processes from waiting for * available audio data frames, sampling audio timestamp, and performs stop or drain the PCM * substream according to configured threshold. * * The function is intended to use for the case that PCM driver operates audio data frames under * acquired lock of PCM substream; e.g. in callback of any operation of &snd_pcm_ops in process * context. In any interrupt context, it's preferrable to use ``snd_pcm_period_elapsed()`` instead * since lock of PCM substream should be acquired in advance. * * Developer should pay enough attention that some callbacks in &snd_pcm_ops are done by the call of * function: * * - .pointer - to retrieve current position of audio data transmission by frame count or XRUN state. * - .trigger - with SNDRV_PCM_TRIGGER_STOP at XRUN or DRAINING state. * - .get_time_info - to retrieve audio time stamp if needed. * * Even if more than one periods have elapsed since the last call, you have to call this only once. */ void snd_pcm_period_elapsed_under_stream_lock(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime; if (PCM_RUNTIME_CHECK(substream)) return; runtime = substream->runtime; if (!snd_pcm_running(substream) || snd_pcm_update_hw_ptr0(substream, 1) < 0) goto _end; #ifdef CONFIG_SND_PCM_TIMER if (substream->timer_running) snd_timer_interrupt(substream->timer, 1); #endif _end: snd_kill_fasync(runtime->fasync, SIGIO, POLL_IN); } EXPORT_SYMBOL(snd_pcm_period_elapsed_under_stream_lock); /** * snd_pcm_period_elapsed() - update the status of runtime for the next period by acquiring lock of * PCM substream. * @substream: the instance of PCM substream. * * This function is mostly similar to ``snd_pcm_period_elapsed_under_stream_lock()`` except for * acquiring lock of PCM substream voluntarily. * * It's typically called by any type of IRQ handler when hardware IRQ occurs to notify event that * the batch of audio data frames as the same size as the period of buffer is already processed in * audio data transmission. */ void snd_pcm_period_elapsed(struct snd_pcm_substream *substream) { unsigned long flags; if (snd_BUG_ON(!substream)) return; snd_pcm_stream_lock_irqsave(substream, flags); snd_pcm_period_elapsed_under_stream_lock(substream); snd_pcm_stream_unlock_irqrestore(substream, flags); } EXPORT_SYMBOL(snd_pcm_period_elapsed); /* * Wait until avail_min data becomes available * Returns a negative error code if any error occurs during operation. * The available space is stored on availp. When err = 0 and avail = 0 * on the capture stream, it indicates the stream is in DRAINING state. */ static int wait_for_avail(struct snd_pcm_substream *substream, snd_pcm_uframes_t *availp) { struct snd_pcm_runtime *runtime = substream->runtime; int is_playback = substream->stream == SNDRV_PCM_STREAM_PLAYBACK; wait_queue_entry_t wait; int err = 0; snd_pcm_uframes_t avail = 0; long wait_time, tout; init_waitqueue_entry(&wait, current); set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&runtime->tsleep, &wait); if (runtime->no_period_wakeup) wait_time = MAX_SCHEDULE_TIMEOUT; else { /* use wait time from substream if available */ if (substream->wait_time) { wait_time = substream->wait_time; } else { wait_time = 100; if (runtime->rate) { long t = runtime->buffer_size * 1100 / runtime->rate; wait_time = max(t, wait_time); } } wait_time = msecs_to_jiffies(wait_time); } for (;;) { if (signal_pending(current)) { err = -ERESTARTSYS; break; } /* * We need to check if space became available already * (and thus the wakeup happened already) first to close * the race of space already having become available. * This check must happen after been added to the waitqueue * and having current state be INTERRUPTIBLE. */ avail = snd_pcm_avail(substream); if (avail >= runtime->twake) break; snd_pcm_stream_unlock_irq(substream); tout = schedule_timeout(wait_time); snd_pcm_stream_lock_irq(substream); set_current_state(TASK_INTERRUPTIBLE); switch (runtime->state) { case SNDRV_PCM_STATE_SUSPENDED: err = -ESTRPIPE; goto _endloop; case SNDRV_PCM_STATE_XRUN: err = -EPIPE; goto _endloop; case SNDRV_PCM_STATE_DRAINING: if (is_playback) err = -EPIPE; else avail = 0; /* indicate draining */ goto _endloop; case SNDRV_PCM_STATE_OPEN: case SNDRV_PCM_STATE_SETUP: case SNDRV_PCM_STATE_DISCONNECTED: err = -EBADFD; goto _endloop; case SNDRV_PCM_STATE_PAUSED: continue; } if (!tout) { pcm_dbg(substream->pcm, "%s timeout (DMA or IRQ trouble?)\n", is_playback ? "playback write" : "capture read"); err = -EIO; break; } } _endloop: set_current_state(TASK_RUNNING); remove_wait_queue(&runtime->tsleep, &wait); *availp = avail; return err; } typedef int (*pcm_transfer_f)(struct snd_pcm_substream *substream, int channel, unsigned long hwoff, struct iov_iter *iter, unsigned long bytes); typedef int (*pcm_copy_f)(struct snd_pcm_substream *, snd_pcm_uframes_t, void *, snd_pcm_uframes_t, snd_pcm_uframes_t, pcm_transfer_f, bool); /* calculate the target DMA-buffer position to be written/read */ static void *get_dma_ptr(struct snd_pcm_runtime *runtime, int channel, unsigned long hwoff) { return runtime->dma_area + hwoff + channel * (runtime->dma_bytes / runtime->channels); } /* default copy ops for write; used for both interleaved and non- modes */ static int default_write_copy(struct snd_pcm_substream *substream, int channel, unsigned long hwoff, struct iov_iter *iter, unsigned long bytes) { if (copy_from_iter(get_dma_ptr(substream->runtime, channel, hwoff), bytes, iter) != bytes) return -EFAULT; return 0; } /* fill silence instead of copy data; called as a transfer helper * from __snd_pcm_lib_write() or directly from noninterleaved_copy() when * a NULL buffer is passed */ static int fill_silence(struct snd_pcm_substream *substream, int channel, unsigned long hwoff, struct iov_iter *iter, unsigned long bytes) { struct snd_pcm_runtime *runtime = substream->runtime; if (substream->stream != SNDRV_PCM_STREAM_PLAYBACK) return 0; if (substream->ops->fill_silence) return substream->ops->fill_silence(substream, channel, hwoff, bytes); snd_pcm_format_set_silence(runtime->format, get_dma_ptr(runtime, channel, hwoff), bytes_to_samples(runtime, bytes)); return 0; } /* default copy ops for read; used for both interleaved and non- modes */ static int default_read_copy(struct snd_pcm_substream *substream, int channel, unsigned long hwoff, struct iov_iter *iter, unsigned long bytes) { if (copy_to_iter(get_dma_ptr(substream->runtime, channel, hwoff), bytes, iter) != bytes) return -EFAULT; return 0; } /* call transfer with the filled iov_iter */ static int do_transfer(struct snd_pcm_substream *substream, int c, unsigned long hwoff, void *data, unsigned long bytes, pcm_transfer_f transfer, bool in_kernel) { struct iov_iter iter; int err, type; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) type = ITER_SOURCE; else type = ITER_DEST; if (in_kernel) { struct kvec kvec = { data, bytes }; iov_iter_kvec(&iter, type, &kvec, 1, bytes); return transfer(substream, c, hwoff, &iter, bytes); } err = import_ubuf(type, (__force void __user *)data, bytes, &iter); if (err) return err; return transfer(substream, c, hwoff, &iter, bytes); } /* call transfer function with the converted pointers and sizes; * for interleaved mode, it's one shot for all samples */ static int interleaved_copy(struct snd_pcm_substream *substream, snd_pcm_uframes_t hwoff, void *data, snd_pcm_uframes_t off, snd_pcm_uframes_t frames, pcm_transfer_f transfer, bool in_kernel) { struct snd_pcm_runtime *runtime = substream->runtime; /* convert to bytes */ hwoff = frames_to_bytes(runtime, hwoff); off = frames_to_bytes(runtime, off); frames = frames_to_bytes(runtime, frames); return do_transfer(substream, 0, hwoff, data + off, frames, transfer, in_kernel); } /* call transfer function with the converted pointers and sizes for each * non-interleaved channel; when buffer is NULL, silencing instead of copying */ static int noninterleaved_copy(struct snd_pcm_substream *substream, snd_pcm_uframes_t hwoff, void *data, snd_pcm_uframes_t off, snd_pcm_uframes_t frames, pcm_transfer_f transfer, bool in_kernel) { struct snd_pcm_runtime *runtime = substream->runtime; int channels = runtime->channels; void **bufs = data; int c, err; /* convert to bytes; note that it's not frames_to_bytes() here. * in non-interleaved mode, we copy for each channel, thus * each copy is n_samples bytes x channels = whole frames. */ off = samples_to_bytes(runtime, off); frames = samples_to_bytes(runtime, frames); hwoff = samples_to_bytes(runtime, hwoff); for (c = 0; c < channels; ++c, ++bufs) { if (!data || !*bufs) err = fill_silence(substream, c, hwoff, NULL, frames); else err = do_transfer(substream, c, hwoff, *bufs + off, frames, transfer, in_kernel); if (err < 0) return err; } return 0; } /* fill silence on the given buffer position; * called from snd_pcm_playback_silence() */ static int fill_silence_frames(struct snd_pcm_substream *substream, snd_pcm_uframes_t off, snd_pcm_uframes_t frames) { if (substream->runtime->access == SNDRV_PCM_ACCESS_RW_INTERLEAVED || substream->runtime->access == SNDRV_PCM_ACCESS_MMAP_INTERLEAVED) return interleaved_copy(substream, off, NULL, 0, frames, fill_silence, true); else return noninterleaved_copy(substream, off, NULL, 0, frames, fill_silence, true); } /* sanity-check for read/write methods */ static int pcm_sanity_check(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; if (snd_BUG_ON(!substream->ops->copy && !runtime->dma_area)) return -EINVAL; if (runtime->state == SNDRV_PCM_STATE_OPEN) return -EBADFD; return 0; } static int pcm_accessible_state(struct snd_pcm_runtime *runtime) { switch (runtime->state) { case SNDRV_PCM_STATE_PREPARED: case SNDRV_PCM_STATE_RUNNING: case SNDRV_PCM_STATE_PAUSED: return 0; case SNDRV_PCM_STATE_XRUN: return -EPIPE; case SNDRV_PCM_STATE_SUSPENDED: return -ESTRPIPE; default: return -EBADFD; } } /* update to the given appl_ptr and call ack callback if needed; * when an error is returned, take back to the original value */ int pcm_lib_apply_appl_ptr(struct snd_pcm_substream *substream, snd_pcm_uframes_t appl_ptr) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_uframes_t old_appl_ptr = runtime->control->appl_ptr; snd_pcm_sframes_t diff; int ret; if (old_appl_ptr == appl_ptr) return 0; if (appl_ptr >= runtime->boundary) return -EINVAL; /* * check if a rewind is requested by the application */ if (substream->runtime->info & SNDRV_PCM_INFO_NO_REWINDS) { diff = appl_ptr - old_appl_ptr; if (diff >= 0) { if (diff > runtime->buffer_size) return -EINVAL; } else { if (runtime->boundary + diff > runtime->buffer_size) return -EINVAL; } } runtime->control->appl_ptr = appl_ptr; if (substream->ops->ack) { ret = substream->ops->ack(substream); if (ret < 0) { runtime->control->appl_ptr = old_appl_ptr; if (ret == -EPIPE) __snd_pcm_xrun(substream); return ret; } } trace_applptr(substream, old_appl_ptr, appl_ptr); return 0; } /* the common loop for read/write data */ snd_pcm_sframes_t __snd_pcm_lib_xfer(struct snd_pcm_substream *substream, void *data, bool interleaved, snd_pcm_uframes_t size, bool in_kernel) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_uframes_t xfer = 0; snd_pcm_uframes_t offset = 0; snd_pcm_uframes_t avail; pcm_copy_f writer; pcm_transfer_f transfer; bool nonblock; bool is_playback; int err; err = pcm_sanity_check(substream); if (err < 0) return err; is_playback = substream->stream == SNDRV_PCM_STREAM_PLAYBACK; if (interleaved) { if (runtime->access != SNDRV_PCM_ACCESS_RW_INTERLEAVED && runtime->channels > 1) return -EINVAL; writer = interleaved_copy; } else { if (runtime->access != SNDRV_PCM_ACCESS_RW_NONINTERLEAVED) return -EINVAL; writer = noninterleaved_copy; } if (!data) { if (is_playback) transfer = fill_silence; else return -EINVAL; } else { if (substream->ops->copy) transfer = substream->ops->copy; else transfer = is_playback ? default_write_copy : default_read_copy; } if (size == 0) return 0; nonblock = !!(substream->f_flags & O_NONBLOCK); snd_pcm_stream_lock_irq(substream); err = pcm_accessible_state(runtime); if (err < 0) goto _end_unlock; runtime->twake = runtime->control->avail_min ? : 1; if (runtime->state == SNDRV_PCM_STATE_RUNNING) snd_pcm_update_hw_ptr(substream); /* * If size < start_threshold, wait indefinitely. Another * thread may start capture */ if (!is_playback && runtime->state == SNDRV_PCM_STATE_PREPARED && size >= runtime->start_threshold) { err = snd_pcm_start(substream); if (err < 0) goto _end_unlock; } avail = snd_pcm_avail(substream); while (size > 0) { snd_pcm_uframes_t frames, appl_ptr, appl_ofs; snd_pcm_uframes_t cont; if (!avail) { if (!is_playback && runtime->state == SNDRV_PCM_STATE_DRAINING) { snd_pcm_stop(substream, SNDRV_PCM_STATE_SETUP); goto _end_unlock; } if (nonblock) { err = -EAGAIN; goto _end_unlock; } runtime->twake = min_t(snd_pcm_uframes_t, size, runtime->control->avail_min ? : 1); err = wait_for_avail(substream, &avail); if (err < 0) goto _end_unlock; if (!avail) continue; /* draining */ } frames = size > avail ? avail : size; appl_ptr = READ_ONCE(runtime->control->appl_ptr); appl_ofs = appl_ptr % runtime->buffer_size; cont = runtime->buffer_size - appl_ofs; if (frames > cont) frames = cont; if (snd_BUG_ON(!frames)) { err = -EINVAL; goto _end_unlock; } if (!atomic_inc_unless_negative(&runtime->buffer_accessing)) { err = -EBUSY; goto _end_unlock; } snd_pcm_stream_unlock_irq(substream); if (!is_playback) snd_pcm_dma_buffer_sync(substream, SNDRV_DMA_SYNC_CPU); err = writer(substream, appl_ofs, data, offset, frames, transfer, in_kernel); if (is_playback) snd_pcm_dma_buffer_sync(substream, SNDRV_DMA_SYNC_DEVICE); snd_pcm_stream_lock_irq(substream); atomic_dec(&runtime->buffer_accessing); if (err < 0) goto _end_unlock; err = pcm_accessible_state(runtime); if (err < 0) goto _end_unlock; appl_ptr += frames; if (appl_ptr >= runtime->boundary) appl_ptr -= runtime->boundary; err = pcm_lib_apply_appl_ptr(substream, appl_ptr); if (err < 0) goto _end_unlock; offset += frames; size -= frames; xfer += frames; avail -= frames; if (is_playback && runtime->state == SNDRV_PCM_STATE_PREPARED && snd_pcm_playback_hw_avail(runtime) >= (snd_pcm_sframes_t)runtime->start_threshold) { err = snd_pcm_start(substream); if (err < 0) goto _end_unlock; } } _end_unlock: runtime->twake = 0; if (xfer > 0 && err >= 0) snd_pcm_update_state(substream, runtime); snd_pcm_stream_unlock_irq(substream); return xfer > 0 ? (snd_pcm_sframes_t)xfer : err; } EXPORT_SYMBOL(__snd_pcm_lib_xfer); /* * standard channel mapping helpers */ /* default channel maps for multi-channel playbacks, up to 8 channels */ const struct snd_pcm_chmap_elem snd_pcm_std_chmaps[] = { { .channels = 1, .map = { SNDRV_CHMAP_MONO } }, { .channels = 2, .map = { SNDRV_CHMAP_FL, SNDRV_CHMAP_FR } }, { .channels = 4, .map = { SNDRV_CHMAP_FL, SNDRV_CHMAP_FR, SNDRV_CHMAP_RL, SNDRV_CHMAP_RR } }, { .channels = 6, .map = { SNDRV_CHMAP_FL, SNDRV_CHMAP_FR, SNDRV_CHMAP_RL, SNDRV_CHMAP_RR, SNDRV_CHMAP_FC, SNDRV_CHMAP_LFE } }, { .channels = 8, .map = { SNDRV_CHMAP_FL, SNDRV_CHMAP_FR, SNDRV_CHMAP_RL, SNDRV_CHMAP_RR, SNDRV_CHMAP_FC, SNDRV_CHMAP_LFE, SNDRV_CHMAP_SL, SNDRV_CHMAP_SR } }, { } }; EXPORT_SYMBOL_GPL(snd_pcm_std_chmaps); /* alternative channel maps with CLFE <-> surround swapped for 6/8 channels */ const struct snd_pcm_chmap_elem snd_pcm_alt_chmaps[] = { { .channels = 1, .map = { SNDRV_CHMAP_MONO } }, { .channels = 2, .map = { SNDRV_CHMAP_FL, SNDRV_CHMAP_FR } }, { .channels = 4, .map = { SNDRV_CHMAP_FL, SNDRV_CHMAP_FR, SNDRV_CHMAP_RL, SNDRV_CHMAP_RR } }, { .channels = 6, .map = { SNDRV_CHMAP_FL, SNDRV_CHMAP_FR, SNDRV_CHMAP_FC, SNDRV_CHMAP_LFE, SNDRV_CHMAP_RL, SNDRV_CHMAP_RR } }, { .channels = 8, .map = { SNDRV_CHMAP_FL, SNDRV_CHMAP_FR, SNDRV_CHMAP_FC, SNDRV_CHMAP_LFE, SNDRV_CHMAP_RL, SNDRV_CHMAP_RR, SNDRV_CHMAP_SL, SNDRV_CHMAP_SR } }, { } }; EXPORT_SYMBOL_GPL(snd_pcm_alt_chmaps); static bool valid_chmap_channels(const struct snd_pcm_chmap *info, int ch) { if (ch > info->max_channels) return false; return !info->channel_mask || (info->channel_mask & (1U << ch)); } static int pcm_chmap_ctl_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { struct snd_pcm_chmap *info = snd_kcontrol_chip(kcontrol); uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; uinfo->count = info->max_channels; uinfo->value.integer.min = 0; uinfo->value.integer.max = SNDRV_CHMAP_LAST; return 0; } /* get callback for channel map ctl element * stores the channel position firstly matching with the current channels */ static int pcm_chmap_ctl_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_pcm_chmap *info = snd_kcontrol_chip(kcontrol); unsigned int idx = snd_ctl_get_ioffidx(kcontrol, &ucontrol->id); struct snd_pcm_substream *substream; const struct snd_pcm_chmap_elem *map; if (!info->chmap) return -EINVAL; substream = snd_pcm_chmap_substream(info, idx); if (!substream) return -ENODEV; memset(ucontrol->value.integer.value, 0, sizeof(long) * info->max_channels); if (!substream->runtime) return 0; /* no channels set */ for (map = info->chmap; map->channels; map++) { int i; if (map->channels == substream->runtime->channels && valid_chmap_channels(info, map->channels)) { for (i = 0; i < map->channels; i++) ucontrol->value.integer.value[i] = map->map[i]; return 0; } } return -EINVAL; } /* tlv callback for channel map ctl element * expands the pre-defined channel maps in a form of TLV */ static int pcm_chmap_ctl_tlv(struct snd_kcontrol *kcontrol, int op_flag, unsigned int size, unsigned int __user *tlv) { struct snd_pcm_chmap *info = snd_kcontrol_chip(kcontrol); const struct snd_pcm_chmap_elem *map; unsigned int __user *dst; int c, count = 0; if (!info->chmap) return -EINVAL; if (size < 8) return -ENOMEM; if (put_user(SNDRV_CTL_TLVT_CONTAINER, tlv)) return -EFAULT; size -= 8; dst = tlv + 2; for (map = info->chmap; map->channels; map++) { int chs_bytes = map->channels * 4; if (!valid_chmap_channels(info, map->channels)) continue; if (size < 8) return -ENOMEM; if (put_user(SNDRV_CTL_TLVT_CHMAP_FIXED, dst) || put_user(chs_bytes, dst + 1)) return -EFAULT; dst += 2; size -= 8; count += 8; if (size < chs_bytes) return -ENOMEM; size -= chs_bytes; count += chs_bytes; for (c = 0; c < map->channels; c++) { if (put_user(map->map[c], dst)) return -EFAULT; dst++; } } if (put_user(count, tlv + 1)) return -EFAULT; return 0; } static void pcm_chmap_ctl_private_free(struct snd_kcontrol *kcontrol) { struct snd_pcm_chmap *info = snd_kcontrol_chip(kcontrol); info->pcm->streams[info->stream].chmap_kctl = NULL; kfree(info); } /** * snd_pcm_add_chmap_ctls - create channel-mapping control elements * @pcm: the assigned PCM instance * @stream: stream direction * @chmap: channel map elements (for query) * @max_channels: the max number of channels for the stream * @private_value: the value passed to each kcontrol's private_value field * @info_ret: store struct snd_pcm_chmap instance if non-NULL * * Create channel-mapping control elements assigned to the given PCM stream(s). * Return: Zero if successful, or a negative error value. */ int snd_pcm_add_chmap_ctls(struct snd_pcm *pcm, int stream, const struct snd_pcm_chmap_elem *chmap, int max_channels, unsigned long private_value, struct snd_pcm_chmap **info_ret) { struct snd_pcm_chmap *info; struct snd_kcontrol_new knew = { .iface = SNDRV_CTL_ELEM_IFACE_PCM, .access = SNDRV_CTL_ELEM_ACCESS_READ | SNDRV_CTL_ELEM_ACCESS_TLV_READ | SNDRV_CTL_ELEM_ACCESS_TLV_CALLBACK, .info = pcm_chmap_ctl_info, .get = pcm_chmap_ctl_get, .tlv.c = pcm_chmap_ctl_tlv, }; int err; if (WARN_ON(pcm->streams[stream].chmap_kctl)) return -EBUSY; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; info->pcm = pcm; info->stream = stream; info->chmap = chmap; info->max_channels = max_channels; if (stream == SNDRV_PCM_STREAM_PLAYBACK) knew.name = "Playback Channel Map"; else knew.name = "Capture Channel Map"; knew.device = pcm->device; knew.count = pcm->streams[stream].substream_count; knew.private_value = private_value; info->kctl = snd_ctl_new1(&knew, info); if (!info->kctl) { kfree(info); return -ENOMEM; } info->kctl->private_free = pcm_chmap_ctl_private_free; err = snd_ctl_add(pcm->card, info->kctl); if (err < 0) return err; pcm->streams[stream].chmap_kctl = info->kctl; if (info_ret) *info_ret = info; return 0; } EXPORT_SYMBOL_GPL(snd_pcm_add_chmap_ctls);
24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the TCP protocol. * * Version: @(#)tcp.h 1.0.2 04/28/93 * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_TCP_H #define _LINUX_TCP_H #include <linux/skbuff.h> #include <linux/win_minmax.h> #include <net/sock.h> #include <net/inet_connection_sock.h> #include <net/inet_timewait_sock.h> #include <uapi/linux/tcp.h> static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb) { return (struct tcphdr *)skb_transport_header(skb); } static inline unsigned int __tcp_hdrlen(const struct tcphdr *th) { return th->doff * 4; } static inline unsigned int tcp_hdrlen(const struct sk_buff *skb) { return __tcp_hdrlen(tcp_hdr(skb)); } static inline struct tcphdr *inner_tcp_hdr(const struct sk_buff *skb) { return (struct tcphdr *)skb_inner_transport_header(skb); } static inline unsigned int inner_tcp_hdrlen(const struct sk_buff *skb) { return inner_tcp_hdr(skb)->doff * 4; } /** * skb_tcp_all_headers - Returns size of all headers for a TCP packet * @skb: buffer * * Used in TX path, for a packet known to be a TCP one. * * if (skb_is_gso(skb)) { * int hlen = skb_tcp_all_headers(skb); * ... */ static inline int skb_tcp_all_headers(const struct sk_buff *skb) { return skb_transport_offset(skb) + tcp_hdrlen(skb); } /** * skb_inner_tcp_all_headers - Returns size of all headers for an encap TCP packet * @skb: buffer * * Used in TX path, for a packet known to be a TCP one. * * if (skb_is_gso(skb) && skb->encapsulation) { * int hlen = skb_inner_tcp_all_headers(skb); * ... */ static inline int skb_inner_tcp_all_headers(const struct sk_buff *skb) { return skb_inner_transport_offset(skb) + inner_tcp_hdrlen(skb); } static inline unsigned int tcp_optlen(const struct sk_buff *skb) { return (tcp_hdr(skb)->doff - 5) * 4; } /* TCP Fast Open */ #define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */ #define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */ #define TCP_FASTOPEN_COOKIE_SIZE 8 /* the size employed by this impl. */ /* TCP Fast Open Cookie as stored in memory */ struct tcp_fastopen_cookie { __le64 val[DIV_ROUND_UP(TCP_FASTOPEN_COOKIE_MAX, sizeof(u64))]; s8 len; bool exp; /* In RFC6994 experimental option format */ }; /* This defines a selective acknowledgement block. */ struct tcp_sack_block_wire { __be32 start_seq; __be32 end_seq; }; struct tcp_sack_block { u32 start_seq; u32 end_seq; }; /*These are used to set the sack_ok field in struct tcp_options_received */ #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */ #define TCP_DSACK_SEEN (1 << 2) /*1 = DSACK was received from peer*/ struct tcp_options_received { /* PAWS/RTTM data */ int ts_recent_stamp;/* Time we stored ts_recent (for aging) */ u32 ts_recent; /* Time stamp to echo next */ u32 rcv_tsval; /* Time stamp value */ u32 rcv_tsecr; /* Time stamp echo reply */ u16 saw_tstamp : 1, /* Saw TIMESTAMP on last packet */ tstamp_ok : 1, /* TIMESTAMP seen on SYN packet */ dsack : 1, /* D-SACK is scheduled */ wscale_ok : 1, /* Wscale seen on SYN packet */ sack_ok : 3, /* SACK seen on SYN packet */ smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ u8 saw_unknown:1, /* Received unknown option */ unused:7; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ }; static inline void tcp_clear_options(struct tcp_options_received *rx_opt) { rx_opt->tstamp_ok = rx_opt->sack_ok = 0; rx_opt->wscale_ok = rx_opt->snd_wscale = 0; #if IS_ENABLED(CONFIG_SMC) rx_opt->smc_ok = 0; #endif } /* This is the max number of SACKS that we'll generate and process. It's safe * to increase this, although since: * size = TCPOLEN_SACK_BASE_ALIGNED (4) + n * TCPOLEN_SACK_PERBLOCK (8) * only four options will fit in a standard TCP header */ #define TCP_NUM_SACKS 4 struct tcp_request_sock_ops; struct tcp_request_sock { struct inet_request_sock req; const struct tcp_request_sock_ops *af_specific; u64 snt_synack; /* first SYNACK sent time */ bool tfo_listener; bool is_mptcp; bool req_usec_ts; #if IS_ENABLED(CONFIG_MPTCP) bool drop_req; #endif u32 txhash; u32 rcv_isn; u32 snt_isn; u32 ts_off; u32 last_oow_ack_time; /* last SYNACK */ u32 rcv_nxt; /* the ack # by SYNACK. For * FastOpen it's the seq# * after data-in-SYN. */ u8 syn_tos; #ifdef CONFIG_TCP_AO u8 ao_keyid; u8 ao_rcv_next; bool used_tcp_ao; #endif }; static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) { return (struct tcp_request_sock *)req; } static inline bool tcp_rsk_used_ao(const struct request_sock *req) { #ifndef CONFIG_TCP_AO return false; #else return tcp_rsk(req)->used_tcp_ao; #endif } #define TCP_RMEM_TO_WIN_SCALE 8 struct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; u16 tcp_header_len; /* Bytes of tcp header to send */ u16 gso_segs; /* Max number of segs per GSO packet */ /* * Header prediction flags * 0x5?10 << 16 + snd_wnd in net byte order */ __be32 pred_flags; /* * RFC793 variables by their proper names. This means you can * read the code and the spec side by side (and laugh ...) * See RFC793 and RFC1122. The RFC writes these in capitals. */ u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived * sum(delta(rcv_nxt)), or how many bytes * were acked. */ u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn * total number of segments in. */ u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn * total number of data segments in. */ u32 rcv_nxt; /* What we want to receive next */ u32 copied_seq; /* Head of yet unread data */ u32 rcv_wup; /* rcv_nxt on last window update sent */ u32 snd_nxt; /* Next sequence we send */ u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut * The total number of segments sent. */ u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut * total number of data segments sent. */ u64 bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut * total number of data bytes sent. */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked * sum(delta(snd_una)), or how many bytes * were acked. */ u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups * total number of DSACK blocks received */ u32 snd_una; /* First byte we want an ack for */ u32 snd_sml; /* Last byte of the most recently transmitted small packet */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ u32 last_oow_ack_time; /* timestamp of last out-of-window ACK */ u32 compressed_ack_rcv_nxt; u32 tsoffset; /* timestamp offset */ struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ u32 snd_wl1; /* Sequence for window update */ u32 snd_wnd; /* The window we expect to receive */ u32 max_window; /* Maximal window ever seen from peer */ u32 mss_cache; /* Cached effective mss, not including SACKS */ u32 window_clamp; /* Maximal window to advertise */ u32 rcv_ssthresh; /* Current window clamp */ u8 scaling_ratio; /* see tcp_win_from_space() */ /* Information of the most recently (s)acked skb */ struct tcp_rack { u64 mstamp; /* (Re)sent time of the skb */ u32 rtt_us; /* Associated RTT */ u32 end_seq; /* Ending TCP sequence of the skb */ u32 last_delivered; /* tp->delivered at last reo_wnd adj */ u8 reo_wnd_steps; /* Allowed reordering window */ #define TCP_RACK_RECOVERY_THRESH 16 u8 reo_wnd_persist:5, /* No. of recovery since last adj */ dsack_seen:1, /* Whether DSACK seen after last adj */ advanced:1; /* mstamp advanced since last lost marking */ } rack; u16 advmss; /* Advertised MSS */ u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ tcp_usec_ts:1, /* TSval values in usec */ unused:4; u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ fastopen_client_fail:2; /* reason why fastopen failed */ u8 nonagle : 4,/* Disable Nagle algorithm? */ thin_lto : 1,/* Use linear timeouts for thin streams */ recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ repair : 1, frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; u8 save_syn:2, /* Save headers of SYN packet */ syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_fastopen_ch:1, /* Active TFO re-enabling probe */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ u32 tlp_high_seq; /* snd_nxt at the time of TLP */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ u64 tcp_wstamp_ns; /* departure time for next sent data packet */ u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ /* RTT measurement */ u64 tcp_mstamp; /* most recent packet received/sent */ u32 srtt_us; /* smoothed round trip time << 3 in usecs */ u32 mdev_us; /* medium deviation */ u32 mdev_max_us; /* maximal mdev for the last rtt period */ u32 rttvar_us; /* smoothed mdev_max */ u32 rtt_seq; /* sequence number to update rttvar */ struct minmax rtt_min; u32 packets_out; /* Packets which are "in flight" */ u32 retrans_out; /* Retransmitted packets out */ u32 max_packets_out; /* max packets_out in last window */ u32 cwnd_usage_seq; /* right edge of cwnd usage tracking flight */ u16 urg_data; /* Saved octet of OOB data and control flags */ u8 ecn_flags; /* ECN status bits. */ u8 keepalive_probes; /* num of allowed keep alive probes */ u32 reordering; /* Packet reordering metric. */ u32 reord_seen; /* number of data packet reordering events */ u32 snd_up; /* Urgent pointer */ /* * Options received (usually on last packet, some only on SYN packets). */ struct tcp_options_received rx_opt; /* * Slow start and congestion control (see also Nagle, and Karn & Partridge) */ u32 snd_ssthresh; /* Slow start size threshold */ u32 snd_cwnd; /* Sending congestion window */ u32 snd_cwnd_cnt; /* Linear increase counter */ u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ u32 snd_cwnd_used; u32 snd_cwnd_stamp; u32 prior_cwnd; /* cwnd right before starting loss recovery */ u32 prr_delivered; /* Number of newly delivered packets to * receiver in Recovery. */ u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */ u32 lost; /* Total data packets lost incl. rexmits */ u32 app_limited; /* limited until "delivered" reaches this val */ u64 first_tx_mstamp; /* start of window send phase */ u64 delivered_mstamp; /* time we reached "delivered" */ u32 rate_delivered; /* saved rate sample: packets delivered */ u32 rate_interval_us; /* saved rate sample: time elapsed */ u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ u32 pushed_seq; /* Last pushed seq, required to talk to windows */ u32 lost_out; /* Lost packets */ u32 sacked_out; /* SACK'd packets */ struct hrtimer pacing_timer; struct hrtimer compressed_ack_timer; /* from STCP, retrans queue hinting */ struct sk_buff* lost_skb_hint; struct sk_buff *retransmit_skb_hint; /* OOO segments go in this rbtree. Socket lock must be held. */ struct rb_root out_of_order_queue; struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */ /* SACKs data, these 2 need to be together (see tcp_options_write) */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ struct tcp_sack_block recv_sack_cache[4]; struct sk_buff *highest_sack; /* skb just after the highest * skb with SACKed bit set * (validity guaranteed only if * sacked_out > 0) */ int lost_cnt_hint; u32 prior_ssthresh; /* ssthresh saved at recovery start */ u32 high_seq; /* snd_nxt at onset of congestion */ u32 retrans_stamp; /* Timestamp of the last retransmit, * also used in SYN-SENT to remember stamp of * the first SYN. */ u32 undo_marker; /* snd_una upon a new recovery episode. */ int undo_retrans; /* number of undoable retransmissions. */ u64 bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans * Total data bytes retransmitted */ u32 total_retrans; /* Total retransmits for entire connection */ u32 rto_stamp; /* Start time (ms) of last CA_Loss recovery */ u16 total_rto; /* Total number of RTO timeouts, including * SYN/SYN-ACK and recurring timeouts. */ u16 total_rto_recoveries; /* Total number of RTO recoveries, * including any unfinished recovery. */ u32 total_rto_time; /* ms spent in (completed) RTO recoveries. */ u32 urg_seq; /* Seq of received urgent pointer */ unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_intvl; /* time interval between keep alive probes */ int linger2; /* Sock_ops bpf program related variables */ #ifdef CONFIG_BPF u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs * values defined in uapi/linux/tcp.h */ u8 bpf_chg_cc_inprogress:1; /* In the middle of * bpf_setsockopt(TCP_CONGESTION), * it is to avoid the bpf_tcp_cc->init() * to recur itself by calling * bpf_setsockopt(TCP_CONGESTION, "itself"). */ #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG) #else #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 #endif u16 timeout_rehash; /* Timeout-triggered rehash attempts */ u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */ /* Receiver side RTT estimation */ u32 rcv_rtt_last_tsecr; struct { u32 rtt_us; u32 seq; u64 time; } rcv_rtt_est; /* Receiver queue space */ struct { u32 space; u32 seq; u64 time; } rcvq_space; /* TCP-specific MTU probe information. */ struct { u32 probe_seq_start; u32 probe_seq_end; } mtu_probe; u32 plb_rehash; /* PLB-triggered rehash attempts */ u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG * while socket was owned by user. */ #if IS_ENABLED(CONFIG_MPTCP) bool is_mptcp; #endif #if IS_ENABLED(CONFIG_SMC) bool (*smc_hs_congested)(const struct sock *sk); bool syn_smc; /* SYN includes SMC */ #endif #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) /* TCP AF-Specific parts; only used by TCP-AO/MD5 Signature support so far */ const struct tcp_sock_af_ops *af_specific; #ifdef CONFIG_TCP_MD5SIG /* TCP MD5 Signature Option information */ struct tcp_md5sig_info __rcu *md5sig_info; #endif #ifdef CONFIG_TCP_AO struct tcp_ao_info __rcu *ao_info; #endif #endif /* TCP fastopen related information */ struct tcp_fastopen_request *fastopen_req; /* fastopen_rsk points to request_sock that resulted in this big * socket. Used to retransmit SYNACKs etc. */ struct request_sock __rcu *fastopen_rsk; struct saved_syn *saved_syn; }; enum tsq_enum { TSQ_THROTTLED, TSQ_QUEUED, TCP_TSQ_DEFERRED, /* tcp_tasklet_func() found socket was owned */ TCP_WRITE_TIMER_DEFERRED, /* tcp_write_timer() found socket was owned */ TCP_DELACK_TIMER_DEFERRED, /* tcp_delack_timer() found socket was owned */ TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call * tcp_v{4|6}_mtu_reduced() */ TCP_ACK_DEFERRED, /* TX pure ack is deferred */ }; enum tsq_flags { TSQF_THROTTLED = BIT(TSQ_THROTTLED), TSQF_QUEUED = BIT(TSQ_QUEUED), TCPF_TSQ_DEFERRED = BIT(TCP_TSQ_DEFERRED), TCPF_WRITE_TIMER_DEFERRED = BIT(TCP_WRITE_TIMER_DEFERRED), TCPF_DELACK_TIMER_DEFERRED = BIT(TCP_DELACK_TIMER_DEFERRED), TCPF_MTU_REDUCED_DEFERRED = BIT(TCP_MTU_REDUCED_DEFERRED), TCPF_ACK_DEFERRED = BIT(TCP_ACK_DEFERRED), }; #define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk) /* Variant of tcp_sk() upgrading a const sock to a read/write tcp socket. * Used in context of (lockless) tcp listeners. */ #define tcp_sk_rw(ptr) container_of(ptr, struct tcp_sock, inet_conn.icsk_inet.sk) struct tcp_timewait_sock { struct inet_timewait_sock tw_sk; #define tw_rcv_nxt tw_sk.__tw_common.skc_tw_rcv_nxt #define tw_snd_nxt tw_sk.__tw_common.skc_tw_snd_nxt u32 tw_rcv_wnd; u32 tw_ts_offset; u32 tw_ts_recent; /* The time we sent the last out-of-window ACK: */ u32 tw_last_oow_ack_time; int tw_ts_recent_stamp; u32 tw_tx_delay; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; #endif #ifdef CONFIG_TCP_AO struct tcp_ao_info __rcu *ao_info; #endif }; static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk) { return (struct tcp_timewait_sock *)sk; } static inline bool tcp_passive_fastopen(const struct sock *sk) { return sk->sk_state == TCP_SYN_RECV && rcu_access_pointer(tcp_sk(sk)->fastopen_rsk) != NULL; } static inline void fastopen_queue_tune(struct sock *sk, int backlog) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; int somaxconn = READ_ONCE(sock_net(sk)->core.sysctl_somaxconn); WRITE_ONCE(queue->fastopenq.max_qlen, min_t(unsigned int, backlog, somaxconn)); } static inline void tcp_move_syn(struct tcp_sock *tp, struct request_sock *req) { tp->saved_syn = req->saved_syn; req->saved_syn = NULL; } static inline void tcp_saved_syn_free(struct tcp_sock *tp) { kfree(tp->saved_syn); tp->saved_syn = NULL; } static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn) { return saved_syn->mac_hdrlen + saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; } struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, const struct sk_buff *orig_skb, const struct sk_buff *ack_skb); static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss) { /* We use READ_ONCE() here because socket might not be locked. * This happens for listeners. */ u16 user_mss = READ_ONCE(tp->rx_opt.user_mss); return (user_mss && user_mss < mss) ? user_mss : mss; } int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); void __tcp_sock_set_cork(struct sock *sk, bool on); void tcp_sock_set_cork(struct sock *sk, bool on); int tcp_sock_set_keepcnt(struct sock *sk, int val); int tcp_sock_set_keepidle_locked(struct sock *sk, int val); int tcp_sock_set_keepidle(struct sock *sk, int val); int tcp_sock_set_keepintvl(struct sock *sk, int val); void __tcp_sock_set_nodelay(struct sock *sk, bool on); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); int tcp_sock_set_user_timeout(struct sock *sk, int val); static inline bool dst_tcp_usec_ts(const struct dst_entry *dst) { return dst_feature(dst, RTAX_FEATURE_TCP_USEC_TS); } #endif /* _LINUX_TCP_H */
40 3 37 47 12 12 11 2 1 1 4 3 3 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 // SPDX-License-Identifier: GPL-2.0-or-later /* user_defined.c: user defined key type * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/seq_file.h> #include <linux/err.h> #include <keys/user-type.h> #include <linux/uaccess.h> #include "internal.h" static int logon_vet_description(const char *desc); /* * user defined keys take an arbitrary string as the description and an * arbitrary blob of data as the payload */ struct key_type key_type_user = { .name = "user", .preparse = user_preparse, .free_preparse = user_free_preparse, .instantiate = generic_key_instantiate, .update = user_update, .revoke = user_revoke, .destroy = user_destroy, .describe = user_describe, .read = user_read, }; EXPORT_SYMBOL_GPL(key_type_user); /* * This key type is essentially the same as key_type_user, but it does * not define a .read op. This is suitable for storing username and * password pairs in the keyring that you do not want to be readable * from userspace. */ struct key_type key_type_logon = { .name = "logon", .preparse = user_preparse, .free_preparse = user_free_preparse, .instantiate = generic_key_instantiate, .update = user_update, .revoke = user_revoke, .destroy = user_destroy, .describe = user_describe, .vet_description = logon_vet_description, }; EXPORT_SYMBOL_GPL(key_type_logon); /* * Preparse a user defined key payload */ int user_preparse(struct key_preparsed_payload *prep) { struct user_key_payload *upayload; size_t datalen = prep->datalen; if (datalen <= 0 || datalen > 32767 || !prep->data) return -EINVAL; upayload = kmalloc(sizeof(*upayload) + datalen, GFP_KERNEL); if (!upayload) return -ENOMEM; /* attach the data */ prep->quotalen = datalen; prep->payload.data[0] = upayload; upayload->datalen = datalen; memcpy(upayload->data, prep->data, datalen); return 0; } EXPORT_SYMBOL_GPL(user_preparse); /* * Free a preparse of a user defined key payload */ void user_free_preparse(struct key_preparsed_payload *prep) { kfree_sensitive(prep->payload.data[0]); } EXPORT_SYMBOL_GPL(user_free_preparse); static void user_free_payload_rcu(struct rcu_head *head) { struct user_key_payload *payload; payload = container_of(head, struct user_key_payload, rcu); kfree_sensitive(payload); } /* * update a user defined key * - the key's semaphore is write-locked */ int user_update(struct key *key, struct key_preparsed_payload *prep) { struct user_key_payload *zap = NULL; int ret; /* check the quota and attach the new data */ ret = key_payload_reserve(key, prep->datalen); if (ret < 0) return ret; /* attach the new data, displacing the old */ key->expiry = prep->expiry; if (key_is_positive(key)) zap = dereference_key_locked(key); rcu_assign_keypointer(key, prep->payload.data[0]); prep->payload.data[0] = NULL; if (zap) call_rcu(&zap->rcu, user_free_payload_rcu); return ret; } EXPORT_SYMBOL_GPL(user_update); /* * dispose of the links from a revoked keyring * - called with the key sem write-locked */ void user_revoke(struct key *key) { struct user_key_payload *upayload = user_key_payload_locked(key); /* clear the quota */ key_payload_reserve(key, 0); if (upayload) { rcu_assign_keypointer(key, NULL); call_rcu(&upayload->rcu, user_free_payload_rcu); } } EXPORT_SYMBOL(user_revoke); /* * dispose of the data dangling from the corpse of a user key */ void user_destroy(struct key *key) { struct user_key_payload *upayload = key->payload.data[0]; kfree_sensitive(upayload); } EXPORT_SYMBOL_GPL(user_destroy); /* * describe the user key */ void user_describe(const struct key *key, struct seq_file *m) { seq_puts(m, key->description); if (key_is_positive(key)) seq_printf(m, ": %u", key->datalen); } EXPORT_SYMBOL_GPL(user_describe); /* * read the key data * - the key's semaphore is read-locked */ long user_read(const struct key *key, char *buffer, size_t buflen) { const struct user_key_payload *upayload; long ret; upayload = user_key_payload_locked(key); ret = upayload->datalen; /* we can return the data as is */ if (buffer && buflen > 0) { if (buflen > upayload->datalen) buflen = upayload->datalen; memcpy(buffer, upayload->data, buflen); } return ret; } EXPORT_SYMBOL_GPL(user_read); /* Vet the description for a "logon" key */ static int logon_vet_description(const char *desc) { char *p; /* require a "qualified" description string */ p = strchr(desc, ':'); if (!p) return -EINVAL; /* also reject description with ':' as first char */ if (p == desc) return -EINVAL; return 0; }
2 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 // SPDX-License-Identifier: GPL-2.0 #include <linux/ptrace.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/export.h> #include <asm/syscall.h> static int collect_syscall(struct task_struct *target, struct syscall_info *info) { unsigned long args[6] = { }; struct pt_regs *regs; if (!try_get_task_stack(target)) { /* Task has no stack, so the task isn't in a syscall. */ memset(info, 0, sizeof(*info)); info->data.nr = -1; return 0; } regs = task_pt_regs(target); if (unlikely(!regs)) { put_task_stack(target); return -EAGAIN; } info->sp = user_stack_pointer(regs); info->data.instruction_pointer = instruction_pointer(regs); info->data.nr = syscall_get_nr(target, regs); if (info->data.nr != -1L) syscall_get_arguments(target, regs, args); info->data.args[0] = args[0]; info->data.args[1] = args[1]; info->data.args[2] = args[2]; info->data.args[3] = args[3]; info->data.args[4] = args[4]; info->data.args[5] = args[5]; put_task_stack(target); return 0; } /** * task_current_syscall - Discover what a blocked task is doing. * @target: thread to examine * @info: structure with the following fields: * .sp - filled with user stack pointer * .data.nr - filled with system call number or -1 * .data.args - filled with @maxargs system call arguments * .data.instruction_pointer - filled with user PC * * If @target is blocked in a system call, returns zero with @info.data.nr * set to the call's number and @info.data.args filled in with its * arguments. Registers not used for system call arguments may not be available * and it is not kosher to use &struct user_regset calls while the system * call is still in progress. Note we may get this result if @target * has finished its system call but not yet returned to user mode, such * as when it's stopped for signal handling or syscall exit tracing. * * If @target is blocked in the kernel during a fault or exception, * returns zero with *@info.data.nr set to -1 and does not fill in * @info.data.args. If so, it's now safe to examine @target using * &struct user_regset get() calls as long as we're sure @target won't return * to user mode. * * Returns -%EAGAIN if @target does not remain blocked. */ int task_current_syscall(struct task_struct *target, struct syscall_info *info) { unsigned long ncsw; unsigned int state; if (target == current) return collect_syscall(target, info); state = READ_ONCE(target->__state); if (unlikely(!state)) return -EAGAIN; ncsw = wait_task_inactive(target, state); if (unlikely(!ncsw) || unlikely(collect_syscall(target, info)) || unlikely(wait_task_inactive(target, state) != ncsw)) return -EAGAIN; return 0; }
1 1 1 12 12 11 11 97 99 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _CCID_H #define _CCID_H /* * net/dccp/ccid.h * * An implementation of the DCCP protocol * Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * CCID infrastructure */ #include <net/sock.h> #include <linux/compiler.h> #include <linux/dccp.h> #include <linux/list.h> #include <linux/module.h> /* maximum value for a CCID (RFC 4340, 19.5) */ #define CCID_MAX 255 #define CCID_SLAB_NAME_LENGTH 32 struct tcp_info; /** * struct ccid_operations - Interface to Congestion-Control Infrastructure * * @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.) * @ccid_ccmps: the CCMPS including network/transport headers (0 when disabled) * @ccid_name: alphabetical identifier string for @ccid_id * @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection * @ccid_hc_{r,t}x_obj_size: size of the receiver/sender half-connection socket * * @ccid_hc_{r,t}x_init: CCID-specific initialisation routine (before startup) * @ccid_hc_{r,t}x_exit: CCID-specific cleanup routine (before destruction) * @ccid_hc_rx_packet_recv: implements the HC-receiver side * @ccid_hc_{r,t}x_parse_options: parsing routine for CCID/HC-specific options * @ccid_hc_{r,t}x_insert_options: insert routine for CCID/HC-specific options * @ccid_hc_tx_packet_recv: implements feedback processing for the HC-sender * @ccid_hc_tx_send_packet: implements the sending part of the HC-sender * @ccid_hc_tx_packet_sent: does accounting for packets in flight by HC-sender * @ccid_hc_{r,t}x_get_info: INET_DIAG information for HC-receiver/sender * @ccid_hc_{r,t}x_getsockopt: socket options specific to HC-receiver/sender */ struct ccid_operations { unsigned char ccid_id; __u32 ccid_ccmps; const char *ccid_name; struct kmem_cache *ccid_hc_rx_slab, *ccid_hc_tx_slab; char ccid_hc_rx_slab_name[CCID_SLAB_NAME_LENGTH]; char ccid_hc_tx_slab_name[CCID_SLAB_NAME_LENGTH]; __u32 ccid_hc_rx_obj_size, ccid_hc_tx_obj_size; /* Interface Routines */ int (*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk); int (*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk); void (*ccid_hc_rx_exit)(struct sock *sk); void (*ccid_hc_tx_exit)(struct sock *sk); void (*ccid_hc_rx_packet_recv)(struct sock *sk, struct sk_buff *skb); int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt, u8 opt, u8 *val, u8 len); int (*ccid_hc_rx_insert_options)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_recv)(struct sock *sk, struct sk_buff *skb); int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt, u8 opt, u8 *val, u8 len); int (*ccid_hc_tx_send_packet)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_sent)(struct sock *sk, unsigned int len); void (*ccid_hc_rx_get_info)(struct sock *sk, struct tcp_info *info); void (*ccid_hc_tx_get_info)(struct sock *sk, struct tcp_info *info); int (*ccid_hc_rx_getsockopt)(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen); int (*ccid_hc_tx_getsockopt)(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen); }; extern struct ccid_operations ccid2_ops; #ifdef CONFIG_IP_DCCP_CCID3 extern struct ccid_operations ccid3_ops; #endif int ccid_initialize_builtins(void); void ccid_cleanup_builtins(void); struct ccid { struct ccid_operations *ccid_ops; char ccid_priv[]; }; static inline void *ccid_priv(const struct ccid *ccid) { return (void *)ccid->ccid_priv; } bool ccid_support_check(u8 const *ccid_array, u8 array_len); int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len); int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, char __user *, int __user *); struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx); static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) { struct ccid *ccid = dp->dccps_hc_rx_ccid; if (ccid == NULL || ccid->ccid_ops == NULL) return -1; return ccid->ccid_ops->ccid_id; } static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) { struct ccid *ccid = dp->dccps_hc_tx_ccid; if (ccid == NULL || ccid->ccid_ops == NULL) return -1; return ccid->ccid_ops->ccid_id; } void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); /* * Congestion control of queued data packets via CCID decision. * * The TX CCID performs its congestion-control by indicating whether and when a * queued packet may be sent, using the return code of ccid_hc_tx_send_packet(). * The following modes are supported via the symbolic constants below: * - timer-based pacing (CCID returns a delay value in milliseconds); * - autonomous dequeueing (CCID internally schedules dccps_xmitlet). */ enum ccid_dequeueing_decision { CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */ CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */ CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */ CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */ CCID_PACKET_ERR = 0xF0000, /* error condition */ }; static inline int ccid_packet_dequeue_eval(const int return_code) { if (return_code < 0) return CCID_PACKET_ERR; if (return_code == 0) return CCID_PACKET_SEND_AT_ONCE; if (return_code <= CCID_PACKET_DELAY_MAX) return CCID_PACKET_DELAY; return return_code; } static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, struct sk_buff *skb) { if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); return CCID_PACKET_SEND_AT_ONCE; } static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, unsigned int len) { if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len); } static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, struct sk_buff *skb) { if (ccid->ccid_ops->ccid_hc_rx_packet_recv != NULL) ccid->ccid_ops->ccid_hc_rx_packet_recv(sk, skb); } static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, struct sk_buff *skb) { if (ccid->ccid_ops->ccid_hc_tx_packet_recv != NULL) ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); } /** * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver * @pkt: type of packet that @opt appears on (RFC 4340, 5.1) * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3) * @val: value of @opt * @len: length of @val in bytes */ static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, u8 pkt, u8 opt, u8 *val, u8 len) { if (!ccid || !ccid->ccid_ops->ccid_hc_tx_parse_options) return 0; return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len); } /** * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender * Arguments are analogous to ccid_hc_tx_parse_options() */ static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, u8 pkt, u8 opt, u8 *val, u8 len) { if (!ccid || !ccid->ccid_ops->ccid_hc_rx_parse_options) return 0; return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len); } static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, struct sk_buff *skb) { if (ccid->ccid_ops->ccid_hc_rx_insert_options != NULL) return ccid->ccid_ops->ccid_hc_rx_insert_options(sk, skb); return 0; } static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk, struct tcp_info *info) { if (ccid->ccid_ops->ccid_hc_rx_get_info != NULL) ccid->ccid_ops->ccid_hc_rx_get_info(sk, info); } static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk, struct tcp_info *info) { if (ccid->ccid_ops->ccid_hc_tx_get_info != NULL) ccid->ccid_ops->ccid_hc_tx_get_info(sk, info); } static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { int rc = -ENOPROTOOPT; if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL) rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len, optval, optlen); return rc; } static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { int rc = -ENOPROTOOPT; if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL) rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len, optval, optlen); return rc; } #endif /* _CCID_H */
22 7 69 69 53 2 52 1 3 2 5 2 19 1 5 5 29 5 12 12 2 4 3 2 3 2 2 2 2 1 1 2 1 3 45 1 28 14 20 9 2 7 4 4 20 58 27 25 6 32 26 20 20 37 37 39 30 9 3 10 2 10 4 10 3 6 32 2 37 37 37 37 37 37 37 36 97 71 18 10 1090 1054 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 // SPDX-License-Identifier: GPL-2.0-or-later /* * Extension Header handling for IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Andi Kleen <ak@muc.de> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> */ /* Changes: * yoshfuji : ensure not to overrun while parsing * tlv options. * Mitsuru KANDA @USAGI and: Remove ipv6_parse_exthdrs(). * YOSHIFUJI Hideaki @USAGI Register inbound extension header * handlers as inet6_protocol{}. */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/icmpv6.h> #include <linux/slab.h> #include <linux/export.h> #include <net/dst.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/rawv6.h> #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/calipso.h> #if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/xfrm.h> #endif #include <linux/seg6.h> #include <net/seg6.h> #ifdef CONFIG_IPV6_SEG6_HMAC #include <net/seg6_hmac.h> #endif #include <net/rpl.h> #include <linux/ioam6.h> #include <net/ioam6.h> #include <net/dst_metadata.h> #include <linux/uaccess.h> /********************* Generic functions *********************/ /* An unknown option is detected, decide what to do */ static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff, bool disallow_unknowns) { if (disallow_unknowns) { /* If unknown TLVs are disallowed by configuration * then always silently drop packet. Note this also * means no ICMP parameter problem is sent which * could be a good property to mitigate a reflection DOS * attack. */ goto drop; } switch ((skb_network_header(skb)[optoff] & 0xC0) >> 6) { case 0: /* ignore */ return true; case 1: /* drop packet */ break; case 3: /* Send ICMP if not a multicast address and drop packet */ /* Actually, it is redundant check. icmp_send will recheck in any case. */ if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) break; fallthrough; case 2: /* send ICMP PARM PROB regardless and drop packet */ icmpv6_param_prob_reason(skb, ICMPV6_UNK_OPTION, optoff, SKB_DROP_REASON_UNHANDLED_PROTO); return false; } drop: kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); return false; } static bool ipv6_hop_ra(struct sk_buff *skb, int optoff); static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff); static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff); static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff); #if IS_ENABLED(CONFIG_IPV6_MIP6) static bool ipv6_dest_hao(struct sk_buff *skb, int optoff); #endif /* Parse tlv encoded option header (hop-by-hop or destination) */ static bool ip6_parse_tlv(bool hopbyhop, struct sk_buff *skb, int max_count) { int len = (skb_transport_header(skb)[1] + 1) << 3; const unsigned char *nh = skb_network_header(skb); int off = skb_network_header_len(skb); bool disallow_unknowns = false; int tlv_count = 0; int padlen = 0; if (unlikely(max_count < 0)) { disallow_unknowns = true; max_count = -max_count; } off += 2; len -= 2; while (len > 0) { int optlen, i; if (nh[off] == IPV6_TLV_PAD1) { padlen++; if (padlen > 7) goto bad; off++; len--; continue; } if (len < 2) goto bad; optlen = nh[off + 1] + 2; if (optlen > len) goto bad; if (nh[off] == IPV6_TLV_PADN) { /* RFC 2460 states that the purpose of PadN is * to align the containing header to multiples * of 8. 7 is therefore the highest valid value. * See also RFC 4942, Section 2.1.9.5. */ padlen += optlen; if (padlen > 7) goto bad; /* RFC 4942 recommends receiving hosts to * actively check PadN payload to contain * only zeroes. */ for (i = 2; i < optlen; i++) { if (nh[off + i] != 0) goto bad; } } else { tlv_count++; if (tlv_count > max_count) goto bad; if (hopbyhop) { switch (nh[off]) { case IPV6_TLV_ROUTERALERT: if (!ipv6_hop_ra(skb, off)) return false; break; case IPV6_TLV_IOAM: if (!ipv6_hop_ioam(skb, off)) return false; break; case IPV6_TLV_JUMBO: if (!ipv6_hop_jumbo(skb, off)) return false; break; case IPV6_TLV_CALIPSO: if (!ipv6_hop_calipso(skb, off)) return false; break; default: if (!ip6_tlvopt_unknown(skb, off, disallow_unknowns)) return false; break; } } else { switch (nh[off]) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_TLV_HAO: if (!ipv6_dest_hao(skb, off)) return false; break; #endif default: if (!ip6_tlvopt_unknown(skb, off, disallow_unknowns)) return false; break; } } padlen = 0; } off += optlen; len -= optlen; } if (len == 0) return true; bad: kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); return false; } /***************************** Destination options header. *****************************/ #if IS_ENABLED(CONFIG_IPV6_MIP6) static bool ipv6_dest_hao(struct sk_buff *skb, int optoff) { struct ipv6_destopt_hao *hao; struct inet6_skb_parm *opt = IP6CB(skb); struct ipv6hdr *ipv6h = ipv6_hdr(skb); SKB_DR(reason); int ret; if (opt->dsthao) { net_dbg_ratelimited("hao duplicated\n"); goto discard; } opt->dsthao = opt->dst1; opt->dst1 = 0; hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) + optoff); if (hao->length != 16) { net_dbg_ratelimited("hao invalid option length = %d\n", hao->length); SKB_DR_SET(reason, IP_INHDR); goto discard; } if (!(ipv6_addr_type(&hao->addr) & IPV6_ADDR_UNICAST)) { net_dbg_ratelimited("hao is not an unicast addr: %pI6\n", &hao->addr); SKB_DR_SET(reason, INVALID_PROTO); goto discard; } ret = xfrm6_input_addr(skb, (xfrm_address_t *)&ipv6h->daddr, (xfrm_address_t *)&hao->addr, IPPROTO_DSTOPTS); if (unlikely(ret < 0)) { SKB_DR_SET(reason, XFRM_POLICY); goto discard; } if (skb_cloned(skb)) { if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) goto discard; /* update all variable using below by copied skbuff */ hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) + optoff); ipv6h = ipv6_hdr(skb); } if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; swap(ipv6h->saddr, hao->addr); if (skb->tstamp == 0) __net_timestamp(skb); return true; discard: kfree_skb_reason(skb, reason); return false; } #endif static int ipv6_destopt_rcv(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) __u16 dstbuf; #endif struct dst_entry *dst = skb_dst(skb); struct net *net = dev_net(skb->dev); int extlen; if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { __IP6_INC_STATS(dev_net(dst->dev), idev, IPSTATS_MIB_INHDRERRORS); fail_and_free: kfree_skb(skb); return -1; } extlen = (skb_transport_header(skb)[1] + 1) << 3; if (extlen > net->ipv6.sysctl.max_dst_opts_len) goto fail_and_free; opt->lastopt = opt->dst1 = skb_network_header_len(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) dstbuf = opt->dst1; #endif if (ip6_parse_tlv(false, skb, net->ipv6.sysctl.max_dst_opts_cnt)) { skb->transport_header += extlen; opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) opt->nhoff = dstbuf; #else opt->nhoff = opt->dst1; #endif return 1; } __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); return -1; } static void seg6_update_csum(struct sk_buff *skb) { struct ipv6_sr_hdr *hdr; struct in6_addr *addr; __be32 from, to; /* srh is at transport offset and seg_left is already decremented * but daddr is not yet updated with next segment */ hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); addr = hdr->segments + hdr->segments_left; hdr->segments_left++; from = *(__be32 *)hdr; hdr->segments_left--; to = *(__be32 *)hdr; /* update skb csum with diff resulting from seg_left decrement */ update_csum_diff4(skb, from, to); /* compute csum diff between current and next segment and update */ update_csum_diff16(skb, (__be32 *)(&ipv6_hdr(skb)->daddr), (__be32 *)addr); } static int ipv6_srh_rcv(struct sk_buff *skb) { struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(skb->dev); struct ipv6_sr_hdr *hdr; struct inet6_dev *idev; struct in6_addr *addr; int accept_seg6; hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); idev = __in6_dev_get(skb->dev); accept_seg6 = net->ipv6.devconf_all->seg6_enabled; if (accept_seg6 > idev->cnf.seg6_enabled) accept_seg6 = idev->cnf.seg6_enabled; if (!accept_seg6) { kfree_skb(skb); return -1; } #ifdef CONFIG_IPV6_SEG6_HMAC if (!seg6_hmac_validate_skb(skb)) { kfree_skb(skb); return -1; } #endif looped_back: if (hdr->segments_left == 0) { if (hdr->nexthdr == NEXTHDR_IPV6 || hdr->nexthdr == NEXTHDR_IPV4) { int offset = (hdr->hdrlen + 1) << 3; skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); skb_pull(skb, offset); skb_postpull_rcsum(skb, skb_transport_header(skb), offset); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->encapsulation = 0; if (hdr->nexthdr == NEXTHDR_IPV4) skb->protocol = htons(ETH_P_IP); __skb_tunnel_rx(skb, skb->dev, net); netif_rx(skb); return -1; } opt->srcrt = skb_network_header_len(skb); opt->lastopt = opt->srcrt; skb->transport_header += (hdr->hdrlen + 1) << 3; opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); return 1; } if (hdr->segments_left >= (hdr->hdrlen >> 1)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); return -1; } if (skb_cloned(skb)) { if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return -1; } hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); } hdr->segments_left--; addr = hdr->segments + hdr->segments_left; skb_push(skb, sizeof(struct ipv6hdr)); if (skb->ip_summed == CHECKSUM_COMPLETE) seg6_update_csum(skb); ipv6_hdr(skb)->daddr = *addr; ip6_route_input(skb); if (skb_dst(skb)->error) { dst_input(skb); return -1; } if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); return -1; } ipv6_hdr(skb)->hop_limit--; skb_pull(skb, sizeof(struct ipv6hdr)); goto looped_back; } dst_input(skb); return -1; } static int ipv6_rpl_srh_rcv(struct sk_buff *skb) { struct ipv6_rpl_sr_hdr *hdr, *ohdr, *chdr; struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(skb->dev); struct inet6_dev *idev; struct ipv6hdr *oldhdr; unsigned char *buf; int accept_rpl_seg; int i, err; u64 n = 0; u32 r; idev = __in6_dev_get(skb->dev); accept_rpl_seg = net->ipv6.devconf_all->rpl_seg_enabled; if (accept_rpl_seg > idev->cnf.rpl_seg_enabled) accept_rpl_seg = idev->cnf.rpl_seg_enabled; if (!accept_rpl_seg) { kfree_skb(skb); return -1; } looped_back: hdr = (struct ipv6_rpl_sr_hdr *)skb_transport_header(skb); if (hdr->segments_left == 0) { if (hdr->nexthdr == NEXTHDR_IPV6) { int offset = (hdr->hdrlen + 1) << 3; skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); skb_pull(skb, offset); skb_postpull_rcsum(skb, skb_transport_header(skb), offset); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->encapsulation = 0; __skb_tunnel_rx(skb, skb->dev, net); netif_rx(skb); return -1; } opt->srcrt = skb_network_header_len(skb); opt->lastopt = opt->srcrt; skb->transport_header += (hdr->hdrlen + 1) << 3; opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); return 1; } n = (hdr->hdrlen << 3) - hdr->pad - (16 - hdr->cmpre); r = do_div(n, (16 - hdr->cmpri)); /* checks if calculation was without remainder and n fits into * unsigned char which is segments_left field. Should not be * higher than that. */ if (r || (n + 1) > 255) { kfree_skb(skb); return -1; } if (hdr->segments_left > n + 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); return -1; } hdr->segments_left--; i = n - hdr->segments_left; buf = kcalloc(struct_size(hdr, segments.addr, n + 2), 2, GFP_ATOMIC); if (unlikely(!buf)) { kfree_skb(skb); return -1; } ohdr = (struct ipv6_rpl_sr_hdr *)buf; ipv6_rpl_srh_decompress(ohdr, hdr, &ipv6_hdr(skb)->daddr, n); chdr = (struct ipv6_rpl_sr_hdr *)(buf + ((ohdr->hdrlen + 1) << 3)); if (ipv6_addr_is_multicast(&ohdr->rpl_segaddr[i])) { kfree_skb(skb); kfree(buf); return -1; } err = ipv6_chk_rpl_srh_loop(net, ohdr->rpl_segaddr, n + 1); if (err) { icmpv6_send(skb, ICMPV6_PARAMPROB, 0, 0); kfree_skb(skb); kfree(buf); return -1; } swap(ipv6_hdr(skb)->daddr, ohdr->rpl_segaddr[i]); ipv6_rpl_srh_compress(chdr, ohdr, &ipv6_hdr(skb)->daddr, n); oldhdr = ipv6_hdr(skb); skb_pull(skb, ((hdr->hdrlen + 1) << 3)); skb_postpull_rcsum(skb, oldhdr, sizeof(struct ipv6hdr) + ((hdr->hdrlen + 1) << 3)); if (unlikely(!hdr->segments_left)) { if (pskb_expand_head(skb, sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3), 0, GFP_ATOMIC)) { __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); kfree(buf); return -1; } oldhdr = ipv6_hdr(skb); } skb_push(skb, ((chdr->hdrlen + 1) << 3) + sizeof(struct ipv6hdr)); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); memmove(ipv6_hdr(skb), oldhdr, sizeof(struct ipv6hdr)); memcpy(skb_transport_header(skb), chdr, (chdr->hdrlen + 1) << 3); ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_postpush_rcsum(skb, ipv6_hdr(skb), sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3)); kfree(buf); ip6_route_input(skb); if (skb_dst(skb)->error) { dst_input(skb); return -1; } if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); return -1; } ipv6_hdr(skb)->hop_limit--; skb_pull(skb, sizeof(struct ipv6hdr)); goto looped_back; } dst_input(skb); return -1; } /******************************** Routing header. ********************************/ /* called with rcu_read_lock() */ static int ipv6_rthdr_rcv(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); struct in6_addr *addr = NULL; int n, i; struct ipv6_rt_hdr *hdr; struct rt0_hdr *rthdr; struct net *net = dev_net(skb->dev); int accept_source_route = net->ipv6.devconf_all->accept_source_route; if (idev && accept_source_route > idev->cnf.accept_source_route) accept_source_route = idev->cnf.accept_source_route; if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb); if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) || skb->pkt_type != PACKET_HOST) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } switch (hdr->type) { case IPV6_SRCRT_TYPE_4: /* segment routing */ return ipv6_srh_rcv(skb); case IPV6_SRCRT_TYPE_3: /* rpl segment routing */ return ipv6_rpl_srh_rcv(skb); default: break; } looped_back: if (hdr->segments_left == 0) { switch (hdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: /* Silently discard type 2 header unless it was * processed by own */ if (!addr) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } break; #endif default: break; } opt->lastopt = opt->srcrt = skb_network_header_len(skb); skb->transport_header += (hdr->hdrlen + 1) << 3; opt->dst0 = opt->dst1; opt->dst1 = 0; opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); return 1; } switch (hdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (accept_source_route < 0) goto unknown_rh; /* Silently discard invalid RTH type 2 */ if (hdr->hdrlen != 2 || hdr->segments_left != 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } break; #endif default: goto unknown_rh; } /* * This is the routing header forwarding algorithm from * RFC 2460, page 16. */ n = hdr->hdrlen >> 1; if (hdr->segments_left > n) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); return -1; } /* We are about to mangle packet header. Be careful! Do not damage packets queued somewhere. */ if (skb_cloned(skb)) { /* the copy is a forwarded packet */ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return -1; } hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb); } if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; i = n - --hdr->segments_left; rthdr = (struct rt0_hdr *) hdr; addr = rthdr->addr; addr += i - 1; switch (hdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (xfrm6_input_addr(skb, (xfrm_address_t *)addr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr, IPPROTO_ROUTING) < 0) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } break; #endif default: break; } if (ipv6_addr_is_multicast(addr)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } swap(*addr, ipv6_hdr(skb)->daddr); ip6_route_input(skb); if (skb_dst(skb)->error) { skb_push(skb, skb->data - skb_network_header(skb)); dst_input(skb); return -1; } if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); return -1; } ipv6_hdr(skb)->hop_limit--; goto looped_back; } skb_push(skb, skb->data - skb_network_header(skb)); dst_input(skb); return -1; unknown_rh: __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->type) - skb_network_header(skb)); return -1; } static const struct inet6_protocol rthdr_protocol = { .handler = ipv6_rthdr_rcv, .flags = INET6_PROTO_NOPOLICY, }; static const struct inet6_protocol destopt_protocol = { .handler = ipv6_destopt_rcv, .flags = INET6_PROTO_NOPOLICY, }; static const struct inet6_protocol nodata_protocol = { .handler = dst_discard, .flags = INET6_PROTO_NOPOLICY, }; int __init ipv6_exthdrs_init(void) { int ret; ret = inet6_add_protocol(&rthdr_protocol, IPPROTO_ROUTING); if (ret) goto out; ret = inet6_add_protocol(&destopt_protocol, IPPROTO_DSTOPTS); if (ret) goto out_rthdr; ret = inet6_add_protocol(&nodata_protocol, IPPROTO_NONE); if (ret) goto out_destopt; out: return ret; out_destopt: inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS); out_rthdr: inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING); goto out; }; void ipv6_exthdrs_exit(void) { inet6_del_protocol(&nodata_protocol, IPPROTO_NONE); inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS); inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING); } /********************************** Hop-by-hop options. **********************************/ /* * Note: we cannot rely on skb_dst(skb) before we assign it in ip6_route_input(). */ static inline struct net *ipv6_skb_net(struct sk_buff *skb) { return skb_dst(skb) ? dev_net(skb_dst(skb)->dev) : dev_net(skb->dev); } /* Router Alert as of RFC 2711 */ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); if (nh[optoff + 1] == 2) { IP6CB(skb)->flags |= IP6SKB_ROUTERALERT; memcpy(&IP6CB(skb)->ra, nh + optoff + 2, sizeof(IP6CB(skb)->ra)); return true; } net_dbg_ratelimited("ipv6_hop_ra: wrong RA length %d\n", nh[optoff + 1]); kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); return false; } /* IOAM */ static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) { struct ioam6_trace_hdr *trace; struct ioam6_namespace *ns; struct ioam6_hdr *hdr; /* Bad alignment (must be 4n-aligned) */ if (optoff & 3) goto drop; /* Ignore if IOAM is not enabled on ingress */ if (!__in6_dev_get(skb->dev)->cnf.ioam6_enabled) goto ignore; /* Truncated Option header */ hdr = (struct ioam6_hdr *)(skb_network_header(skb) + optoff); if (hdr->opt_len < 2) goto drop; switch (hdr->type) { case IOAM6_TYPE_PREALLOC: /* Truncated Pre-allocated Trace header */ if (hdr->opt_len < 2 + sizeof(*trace)) goto drop; /* Malformed Pre-allocated Trace header */ trace = (struct ioam6_trace_hdr *)((u8 *)hdr + sizeof(*hdr)); if (hdr->opt_len < 2 + sizeof(*trace) + trace->remlen * 4) goto drop; /* Ignore if the IOAM namespace is unknown */ ns = ioam6_namespace(ipv6_skb_net(skb), trace->namespace_id); if (!ns) goto ignore; if (!skb_valid_dst(skb)) ip6_route_input(skb); ioam6_fill_trace_data(skb, ns, trace, true); break; default: break; } ignore: return true; drop: kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); return false; } /* Jumbo payload */ static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); SKB_DR(reason); u32 pkt_len; if (nh[optoff + 1] != 4 || (optoff & 3) != 2) { net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", nh[optoff+1]); SKB_DR_SET(reason, IP_INHDR); goto drop; } pkt_len = ntohl(*(__be32 *)(nh + optoff + 2)); if (pkt_len <= IPV6_MAXPLEN) { icmpv6_param_prob_reason(skb, ICMPV6_HDR_FIELD, optoff + 2, SKB_DROP_REASON_IP_INHDR); return false; } if (ipv6_hdr(skb)->payload_len) { icmpv6_param_prob_reason(skb, ICMPV6_HDR_FIELD, optoff, SKB_DROP_REASON_IP_INHDR); return false; } if (pkt_len > skb->len - sizeof(struct ipv6hdr)) { SKB_DR_SET(reason, PKT_TOO_SMALL); goto drop; } if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) goto drop; IP6CB(skb)->flags |= IP6SKB_JUMBOGRAM; return true; drop: kfree_skb_reason(skb, reason); return false; } /* CALIPSO RFC 5570 */ static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); if (nh[optoff + 1] < 8) goto drop; if (nh[optoff + 6] * 4 + 8 > nh[optoff + 1]) goto drop; if (!calipso_validate(skb, nh + optoff)) goto drop; return true; drop: kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); return false; } int ipv6_parse_hopopts(struct sk_buff *skb) { struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(skb->dev); int extlen; /* * skb_network_header(skb) is equal to skb->data, and * skb_network_header_len(skb) is always equal to * sizeof(struct ipv6hdr) by definition of * hop-by-hop options. */ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + 8) || !pskb_may_pull(skb, (sizeof(struct ipv6hdr) + ((skb_transport_header(skb)[1] + 1) << 3)))) { fail_and_free: kfree_skb(skb); return -1; } extlen = (skb_transport_header(skb)[1] + 1) << 3; if (extlen > net->ipv6.sysctl.max_hbh_opts_len) goto fail_and_free; opt->flags |= IP6SKB_HOPBYHOP; if (ip6_parse_tlv(true, skb, net->ipv6.sysctl.max_hbh_opts_cnt)) { skb->transport_header += extlen; opt = IP6CB(skb); opt->nhoff = sizeof(struct ipv6hdr); return 1; } return -1; } /* * Creating outbound headers. * * "build" functions work when skb is filled from head to tail (datagram) * "push" functions work when headers are added from tail to head (tcp) * * In both cases we assume, that caller reserved enough room * for headers. */ static void ipv6_push_rthdr0(struct sk_buff *skb, u8 *proto, struct ipv6_rt_hdr *opt, struct in6_addr **addr_p, struct in6_addr *saddr) { struct rt0_hdr *phdr, *ihdr; int hops; ihdr = (struct rt0_hdr *) opt; phdr = skb_push(skb, (ihdr->rt_hdr.hdrlen + 1) << 3); memcpy(phdr, ihdr, sizeof(struct rt0_hdr)); hops = ihdr->rt_hdr.hdrlen >> 1; if (hops > 1) memcpy(phdr->addr, ihdr->addr + 1, (hops - 1) * sizeof(struct in6_addr)); phdr->addr[hops - 1] = **addr_p; *addr_p = ihdr->addr; phdr->rt_hdr.nexthdr = *proto; *proto = NEXTHDR_ROUTING; } static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto, struct ipv6_rt_hdr *opt, struct in6_addr **addr_p, struct in6_addr *saddr) { struct ipv6_sr_hdr *sr_phdr, *sr_ihdr; int plen, hops; sr_ihdr = (struct ipv6_sr_hdr *)opt; plen = (sr_ihdr->hdrlen + 1) << 3; sr_phdr = skb_push(skb, plen); memcpy(sr_phdr, sr_ihdr, sizeof(struct ipv6_sr_hdr)); hops = sr_ihdr->first_segment + 1; memcpy(sr_phdr->segments + 1, sr_ihdr->segments + 1, (hops - 1) * sizeof(struct in6_addr)); sr_phdr->segments[0] = **addr_p; *addr_p = &sr_ihdr->segments[sr_ihdr->segments_left]; if (sr_ihdr->hdrlen > hops * 2) { int tlvs_offset, tlvs_length; tlvs_offset = (1 + hops * 2) << 3; tlvs_length = (sr_ihdr->hdrlen - hops * 2) << 3; memcpy((char *)sr_phdr + tlvs_offset, (char *)sr_ihdr + tlvs_offset, tlvs_length); } #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(sr_phdr)) { struct net *net = NULL; if (skb->dev) net = dev_net(skb->dev); else if (skb->sk) net = sock_net(skb->sk); WARN_ON(!net); if (net) seg6_push_hmac(net, saddr, sr_phdr); } #endif sr_phdr->nexthdr = *proto; *proto = NEXTHDR_ROUTING; } static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto, struct ipv6_rt_hdr *opt, struct in6_addr **addr_p, struct in6_addr *saddr) { switch (opt->type) { case IPV6_SRCRT_TYPE_0: case IPV6_SRCRT_STRICT: case IPV6_SRCRT_TYPE_2: ipv6_push_rthdr0(skb, proto, opt, addr_p, saddr); break; case IPV6_SRCRT_TYPE_4: ipv6_push_rthdr4(skb, proto, opt, addr_p, saddr); break; default: break; } } static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt) { struct ipv6_opt_hdr *h = skb_push(skb, ipv6_optlen(opt)); memcpy(h, opt, ipv6_optlen(opt)); h->nexthdr = *proto; *proto = type; } void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto, struct in6_addr **daddr, struct in6_addr *saddr) { if (opt->srcrt) { ipv6_push_rthdr(skb, proto, opt->srcrt, daddr, saddr); /* * IPV6_RTHDRDSTOPTS is ignored * unless IPV6_RTHDR is set (RFC3542). */ if (opt->dst0opt) ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt); } if (opt->hopopt) ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt); } void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto) { if (opt->dst1opt) ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt); } EXPORT_SYMBOL(ipv6_push_frag_opts); struct ipv6_txoptions * ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) { struct ipv6_txoptions *opt2; opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC); if (opt2) { long dif = (char *)opt2 - (char *)opt; memcpy(opt2, opt, opt->tot_len); if (opt2->hopopt) *((char **)&opt2->hopopt) += dif; if (opt2->dst0opt) *((char **)&opt2->dst0opt) += dif; if (opt2->dst1opt) *((char **)&opt2->dst1opt) += dif; if (opt2->srcrt) *((char **)&opt2->srcrt) += dif; refcount_set(&opt2->refcnt, 1); } return opt2; } EXPORT_SYMBOL_GPL(ipv6_dup_options); static void ipv6_renew_option(int renewtype, struct ipv6_opt_hdr **dest, struct ipv6_opt_hdr *old, struct ipv6_opt_hdr *new, int newtype, char **p) { struct ipv6_opt_hdr *src; src = (renewtype == newtype ? new : old); if (!src) return; memcpy(*p, src, ipv6_optlen(src)); *dest = (struct ipv6_opt_hdr *)*p; *p += CMSG_ALIGN(ipv6_optlen(*dest)); } /** * ipv6_renew_options - replace a specific ext hdr with a new one. * * @sk: sock from which to allocate memory * @opt: original options * @newtype: option type to replace in @opt * @newopt: new option of type @newtype to replace (user-mem) * * Returns a new set of options which is a copy of @opt with the * option type @newtype replaced with @newopt. * * @opt may be NULL, in which case a new set of options is returned * containing just @newopt. * * @newopt may be NULL, in which case the specified option type is * not copied into the new set of options. * * The new set of options is allocated from the socket option memory * buffer of @sk. */ struct ipv6_txoptions * ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, int newtype, struct ipv6_opt_hdr *newopt) { int tot_len = 0; char *p; struct ipv6_txoptions *opt2; if (opt) { if (newtype != IPV6_HOPOPTS && opt->hopopt) tot_len += CMSG_ALIGN(ipv6_optlen(opt->hopopt)); if (newtype != IPV6_RTHDRDSTOPTS && opt->dst0opt) tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst0opt)); if (newtype != IPV6_RTHDR && opt->srcrt) tot_len += CMSG_ALIGN(ipv6_optlen(opt->srcrt)); if (newtype != IPV6_DSTOPTS && opt->dst1opt) tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt)); } if (newopt) tot_len += CMSG_ALIGN(ipv6_optlen(newopt)); if (!tot_len) return NULL; tot_len += sizeof(*opt2); opt2 = sock_kmalloc(sk, tot_len, GFP_ATOMIC); if (!opt2) return ERR_PTR(-ENOBUFS); memset(opt2, 0, tot_len); refcount_set(&opt2->refcnt, 1); opt2->tot_len = tot_len; p = (char *)(opt2 + 1); ipv6_renew_option(IPV6_HOPOPTS, &opt2->hopopt, (opt ? opt->hopopt : NULL), newopt, newtype, &p); ipv6_renew_option(IPV6_RTHDRDSTOPTS, &opt2->dst0opt, (opt ? opt->dst0opt : NULL), newopt, newtype, &p); ipv6_renew_option(IPV6_RTHDR, (struct ipv6_opt_hdr **)&opt2->srcrt, (opt ? (struct ipv6_opt_hdr *)opt->srcrt : NULL), newopt, newtype, &p); ipv6_renew_option(IPV6_DSTOPTS, &opt2->dst1opt, (opt ? opt->dst1opt : NULL), newopt, newtype, &p); opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) + (opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) + (opt2->srcrt ? ipv6_optlen(opt2->srcrt) : 0); opt2->opt_flen = (opt2->dst1opt ? ipv6_optlen(opt2->dst1opt) : 0); return opt2; } struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt) { /* * ignore the dest before srcrt unless srcrt is being included. * --yoshfuji */ if (opt->dst0opt && !opt->srcrt) { if (opt_space != opt) { memcpy(opt_space, opt, sizeof(*opt_space)); opt = opt_space; } opt->opt_nflen -= ipv6_optlen(opt->dst0opt); opt->dst0opt = NULL; } return opt; } EXPORT_SYMBOL_GPL(__ipv6_fixup_options); /** * fl6_update_dst - update flowi destination address with info given * by srcrt option, if any. * * @fl6: flowi6 for which daddr is to be updated * @opt: struct ipv6_txoptions in which to look for srcrt opt * @orig: copy of original daddr address if modified * * Returns NULL if no txoptions or no srcrt, otherwise returns orig * and initial value of fl6->daddr set in orig */ struct in6_addr *fl6_update_dst(struct flowi6 *fl6, const struct ipv6_txoptions *opt, struct in6_addr *orig) { if (!opt || !opt->srcrt) return NULL; *orig = fl6->daddr; switch (opt->srcrt->type) { case IPV6_SRCRT_TYPE_0: case IPV6_SRCRT_STRICT: case IPV6_SRCRT_TYPE_2: fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr; break; case IPV6_SRCRT_TYPE_4: { struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt; fl6->daddr = srh->segments[srh->segments_left]; break; } default: return NULL; } return orig; } EXPORT_SYMBOL_GPL(fl6_update_dst);
3 3 3 3 6 1 4 1 5 5 1 4 1 5 4 1 5 5 5 3 3 3 2 5 4 4 4 4 3 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2013 Cisco Systems, Inc, 2013. * * Author: Vijay Subramanian <vijaynsu@cisco.com> * Author: Mythili Prabhu <mysuryan@cisco.com> * * ECN support is added by Naeem Khademi <naeemk@ifi.uio.no> * University of Oslo, Norway. * * References: * RFC 8033: https://tools.ietf.org/html/rfc8033 */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> #include <net/pie.h> /* private data for the Qdisc */ struct pie_sched_data { struct pie_vars vars; struct pie_params params; struct pie_stats stats; struct timer_list adapt_timer; struct Qdisc *sch; }; bool pie_drop_early(struct Qdisc *sch, struct pie_params *params, struct pie_vars *vars, u32 backlog, u32 packet_size) { u64 rnd; u64 local_prob = vars->prob; u32 mtu = psched_mtu(qdisc_dev(sch)); /* If there is still burst allowance left skip random early drop */ if (vars->burst_time > 0) return false; /* If current delay is less than half of target, and * if drop prob is low already, disable early_drop */ if ((vars->qdelay < params->target / 2) && (vars->prob < MAX_PROB / 5)) return false; /* If we have fewer than 2 mtu-sized packets, disable pie_drop_early, * similar to min_th in RED */ if (backlog < 2 * mtu) return false; /* If bytemode is turned on, use packet size to compute new * probablity. Smaller packets will have lower drop prob in this case */ if (params->bytemode && packet_size <= mtu) local_prob = (u64)packet_size * div_u64(local_prob, mtu); else local_prob = vars->prob; if (local_prob == 0) vars->accu_prob = 0; else vars->accu_prob += local_prob; if (vars->accu_prob < (MAX_PROB / 100) * 85) return false; if (vars->accu_prob >= (MAX_PROB / 2) * 17) return true; get_random_bytes(&rnd, 8); if ((rnd >> BITS_PER_BYTE) < local_prob) { vars->accu_prob = 0; return true; } return false; } EXPORT_SYMBOL_GPL(pie_drop_early); static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct pie_sched_data *q = qdisc_priv(sch); bool enqueue = false; if (unlikely(qdisc_qlen(sch) >= sch->limit)) { q->stats.overlimit++; goto out; } if (!pie_drop_early(sch, &q->params, &q->vars, sch->qstats.backlog, skb->len)) { enqueue = true; } else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) && INET_ECN_set_ce(skb)) { /* If packet is ecn capable, mark it if drop probability * is lower than 10%, else drop it. */ q->stats.ecn_mark++; enqueue = true; } /* we can enqueue the packet */ if (enqueue) { /* Set enqueue time only when dq_rate_estimator is disabled. */ if (!q->params.dq_rate_estimator) pie_set_enqueue_time(skb); q->stats.packets_in++; if (qdisc_qlen(sch) > q->stats.maxq) q->stats.maxq = qdisc_qlen(sch); return qdisc_enqueue_tail(skb, sch); } out: q->stats.dropped++; q->vars.accu_prob = 0; return qdisc_drop(skb, sch, to_free); } static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { [TCA_PIE_TARGET] = {.type = NLA_U32}, [TCA_PIE_LIMIT] = {.type = NLA_U32}, [TCA_PIE_TUPDATE] = {.type = NLA_U32}, [TCA_PIE_ALPHA] = {.type = NLA_U32}, [TCA_PIE_BETA] = {.type = NLA_U32}, [TCA_PIE_ECN] = {.type = NLA_U32}, [TCA_PIE_BYTEMODE] = {.type = NLA_U32}, [TCA_PIE_DQ_RATE_ESTIMATOR] = {.type = NLA_U32}, }; static int pie_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct pie_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_PIE_MAX + 1]; unsigned int qlen, dropped = 0; int err; err = nla_parse_nested_deprecated(tb, TCA_PIE_MAX, opt, pie_policy, NULL); if (err < 0) return err; sch_tree_lock(sch); /* convert from microseconds to pschedtime */ if (tb[TCA_PIE_TARGET]) { /* target is in us */ u32 target = nla_get_u32(tb[TCA_PIE_TARGET]); /* convert to pschedtime */ q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC); } /* tupdate is in jiffies */ if (tb[TCA_PIE_TUPDATE]) q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE])); if (tb[TCA_PIE_LIMIT]) { u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]); q->params.limit = limit; sch->limit = limit; } if (tb[TCA_PIE_ALPHA]) q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]); if (tb[TCA_PIE_BETA]) q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]); if (tb[TCA_PIE_ECN]) q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]); if (tb[TCA_PIE_BYTEMODE]) q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]); if (tb[TCA_PIE_DQ_RATE_ESTIMATOR]) q->params.dq_rate_estimator = nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR]); /* Drop excess packets if new limit is lower */ qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); rtnl_qdisc_drop(skb, sch); } qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); sch_tree_unlock(sch); return 0; } void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params, struct pie_vars *vars, u32 backlog) { psched_time_t now = psched_get_time(); u32 dtime = 0; /* If dq_rate_estimator is disabled, calculate qdelay using the * packet timestamp. */ if (!params->dq_rate_estimator) { vars->qdelay = now - pie_get_enqueue_time(skb); if (vars->dq_tstamp != DTIME_INVALID) dtime = now - vars->dq_tstamp; vars->dq_tstamp = now; if (backlog == 0) vars->qdelay = 0; if (dtime == 0) return; goto burst_allowance_reduction; } /* If current queue is about 10 packets or more and dq_count is unset * we have enough packets to calculate the drain rate. Save * current time as dq_tstamp and start measurement cycle. */ if (backlog >= QUEUE_THRESHOLD && vars->dq_count == DQCOUNT_INVALID) { vars->dq_tstamp = psched_get_time(); vars->dq_count = 0; } /* Calculate the average drain rate from this value. If queue length * has receded to a small value viz., <= QUEUE_THRESHOLD bytes, reset * the dq_count to -1 as we don't have enough packets to calculate the * drain rate anymore. The following if block is entered only when we * have a substantial queue built up (QUEUE_THRESHOLD bytes or more) * and we calculate the drain rate for the threshold here. dq_count is * in bytes, time difference in psched_time, hence rate is in * bytes/psched_time. */ if (vars->dq_count != DQCOUNT_INVALID) { vars->dq_count += skb->len; if (vars->dq_count >= QUEUE_THRESHOLD) { u32 count = vars->dq_count << PIE_SCALE; dtime = now - vars->dq_tstamp; if (dtime == 0) return; count = count / dtime; if (vars->avg_dq_rate == 0) vars->avg_dq_rate = count; else vars->avg_dq_rate = (vars->avg_dq_rate - (vars->avg_dq_rate >> 3)) + (count >> 3); /* If the queue has receded below the threshold, we hold * on to the last drain rate calculated, else we reset * dq_count to 0 to re-enter the if block when the next * packet is dequeued */ if (backlog < QUEUE_THRESHOLD) { vars->dq_count = DQCOUNT_INVALID; } else { vars->dq_count = 0; vars->dq_tstamp = psched_get_time(); } goto burst_allowance_reduction; } } return; burst_allowance_reduction: if (vars->burst_time > 0) { if (vars->burst_time > dtime) vars->burst_time -= dtime; else vars->burst_time = 0; } } EXPORT_SYMBOL_GPL(pie_process_dequeue); void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, u32 backlog) { psched_time_t qdelay = 0; /* in pschedtime */ psched_time_t qdelay_old = 0; /* in pschedtime */ s64 delta = 0; /* determines the change in probability */ u64 oldprob; u64 alpha, beta; u32 power; bool update_prob = true; if (params->dq_rate_estimator) { qdelay_old = vars->qdelay; vars->qdelay_old = vars->qdelay; if (vars->avg_dq_rate > 0) qdelay = (backlog << PIE_SCALE) / vars->avg_dq_rate; else qdelay = 0; } else { qdelay = vars->qdelay; qdelay_old = vars->qdelay_old; } /* If qdelay is zero and backlog is not, it means backlog is very small, * so we do not update probability in this round. */ if (qdelay == 0 && backlog != 0) update_prob = false; /* In the algorithm, alpha and beta are between 0 and 2 with typical * value for alpha as 0.125. In this implementation, we use values 0-32 * passed from user space to represent this. Also, alpha and beta have * unit of HZ and need to be scaled before they can used to update * probability. alpha/beta are updated locally below by scaling down * by 16 to come to 0-2 range. */ alpha = ((u64)params->alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; beta = ((u64)params->beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; /* We scale alpha and beta differently depending on how heavy the * congestion is. Please see RFC 8033 for details. */ if (vars->prob < MAX_PROB / 10) { alpha >>= 1; beta >>= 1; power = 100; while (vars->prob < div_u64(MAX_PROB, power) && power <= 1000000) { alpha >>= 2; beta >>= 2; power *= 10; } } /* alpha and beta should be between 0 and 32, in multiples of 1/16 */ delta += alpha * (qdelay - params->target); delta += beta * (qdelay - qdelay_old); oldprob = vars->prob; /* to ensure we increase probability in steps of no more than 2% */ if (delta > (s64)(MAX_PROB / (100 / 2)) && vars->prob >= MAX_PROB / 10) delta = (MAX_PROB / 100) * 2; /* Non-linear drop: * Tune drop probability to increase quickly for high delays(>= 250ms) * 250ms is derived through experiments and provides error protection */ if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC))) delta += MAX_PROB / (100 / 2); vars->prob += delta; if (delta > 0) { /* prevent overflow */ if (vars->prob < oldprob) { vars->prob = MAX_PROB; /* Prevent normalization error. If probability is at * maximum value already, we normalize it here, and * skip the check to do a non-linear drop in the next * section. */ update_prob = false; } } else { /* prevent underflow */ if (vars->prob > oldprob) vars->prob = 0; } /* Non-linear drop in probability: Reduce drop probability quickly if * delay is 0 for 2 consecutive Tupdate periods. */ if (qdelay == 0 && qdelay_old == 0 && update_prob) /* Reduce drop probability to 98.4% */ vars->prob -= vars->prob / 64; vars->qdelay = qdelay; vars->backlog_old = backlog; /* We restart the measurement cycle if the following conditions are met * 1. If the delay has been low for 2 consecutive Tupdate periods * 2. Calculated drop probability is zero * 3. If average dq_rate_estimator is enabled, we have at least one * estimate for the avg_dq_rate ie., is a non-zero value */ if ((vars->qdelay < params->target / 2) && (vars->qdelay_old < params->target / 2) && vars->prob == 0 && (!params->dq_rate_estimator || vars->avg_dq_rate > 0)) { pie_vars_init(vars); } if (!params->dq_rate_estimator) vars->qdelay_old = qdelay; } EXPORT_SYMBOL_GPL(pie_calculate_probability); static void pie_timer(struct timer_list *t) { struct pie_sched_data *q = from_timer(q, t, adapt_timer); struct Qdisc *sch = q->sch; spinlock_t *root_lock; rcu_read_lock(); root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); pie_calculate_probability(&q->params, &q->vars, sch->qstats.backlog); /* reset the timer to fire after 'tupdate'. tupdate is in jiffies. */ if (q->params.tupdate) mod_timer(&q->adapt_timer, jiffies + q->params.tupdate); spin_unlock(root_lock); rcu_read_unlock(); } static int pie_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct pie_sched_data *q = qdisc_priv(sch); pie_params_init(&q->params); pie_vars_init(&q->vars); sch->limit = q->params.limit; q->sch = sch; timer_setup(&q->adapt_timer, pie_timer, 0); if (opt) { int err = pie_change(sch, opt, extack); if (err) return err; } mod_timer(&q->adapt_timer, jiffies + HZ / 2); return 0; } static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) { struct pie_sched_data *q = qdisc_priv(sch); struct nlattr *opts; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!opts) goto nla_put_failure; /* convert target from pschedtime to us */ if (nla_put_u32(skb, TCA_PIE_TARGET, ((u32)PSCHED_TICKS2NS(q->params.target)) / NSEC_PER_USEC) || nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) || nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) || nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) || nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) || nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) || nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode) || nla_put_u32(skb, TCA_PIE_DQ_RATE_ESTIMATOR, q->params.dq_rate_estimator)) goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: nla_nest_cancel(skb, opts); return -1; } static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct pie_sched_data *q = qdisc_priv(sch); struct tc_pie_xstats st = { .prob = q->vars.prob << BITS_PER_BYTE, .delay = ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) / NSEC_PER_USEC, .packets_in = q->stats.packets_in, .overlimit = q->stats.overlimit, .maxq = q->stats.maxq, .dropped = q->stats.dropped, .ecn_mark = q->stats.ecn_mark, }; /* avg_dq_rate is only valid if dq_rate_estimator is enabled */ st.dq_rate_estimating = q->params.dq_rate_estimator; /* unscale and return dq_rate in bytes per sec */ if (q->params.dq_rate_estimator) st.avg_dq_rate = q->vars.avg_dq_rate * (PSCHED_TICKS_PER_SEC) >> PIE_SCALE; return gnet_stats_copy_app(d, &st, sizeof(st)); } static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) { struct pie_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = qdisc_dequeue_head(sch); if (!skb) return NULL; pie_process_dequeue(skb, &q->params, &q->vars, sch->qstats.backlog); return skb; } static void pie_reset(struct Qdisc *sch) { struct pie_sched_data *q = qdisc_priv(sch); qdisc_reset_queue(sch); pie_vars_init(&q->vars); } static void pie_destroy(struct Qdisc *sch) { struct pie_sched_data *q = qdisc_priv(sch); q->params.tupdate = 0; del_timer_sync(&q->adapt_timer); } static struct Qdisc_ops pie_qdisc_ops __read_mostly = { .id = "pie", .priv_size = sizeof(struct pie_sched_data), .enqueue = pie_qdisc_enqueue, .dequeue = pie_qdisc_dequeue, .peek = qdisc_peek_dequeued, .init = pie_init, .destroy = pie_destroy, .reset = pie_reset, .change = pie_change, .dump = pie_dump, .dump_stats = pie_dump_stats, .owner = THIS_MODULE, }; static int __init pie_module_init(void) { return register_qdisc(&pie_qdisc_ops); } static void __exit pie_module_exit(void) { unregister_qdisc(&pie_qdisc_ops); } module_init(pie_module_init); module_exit(pie_module_exit); MODULE_DESCRIPTION("Proportional Integral controller Enhanced (PIE) scheduler"); MODULE_AUTHOR("Vijay Subramanian"); MODULE_AUTHOR("Mythili Prabhu"); MODULE_LICENSE("GPL");
15 15 15 15 15 10 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. * * This file contains power management functions related to interrupts. */ #include <linux/irq.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/suspend.h> #include <linux/syscore_ops.h> #include "internals.h" bool irq_pm_check_wakeup(struct irq_desc *desc) { if (irqd_is_wakeup_armed(&desc->irq_data)) { irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; desc->depth++; irq_disable(desc); pm_system_irq_wakeup(irq_desc_get_irq(desc)); return true; } return false; } /* * Called from __setup_irq() with desc->lock held after @action has * been installed in the action chain. */ void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { desc->nr_actions++; if (action->flags & IRQF_FORCE_RESUME) desc->force_resume_depth++; WARN_ON_ONCE(desc->force_resume_depth && desc->force_resume_depth != desc->nr_actions); if (action->flags & IRQF_NO_SUSPEND) desc->no_suspend_depth++; else if (action->flags & IRQF_COND_SUSPEND) desc->cond_suspend_depth++; WARN_ON_ONCE(desc->no_suspend_depth && (desc->no_suspend_depth + desc->cond_suspend_depth) != desc->nr_actions); } /* * Called from __free_irq() with desc->lock held after @action has * been removed from the action chain. */ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { desc->nr_actions--; if (action->flags & IRQF_FORCE_RESUME) desc->force_resume_depth--; if (action->flags & IRQF_NO_SUSPEND) desc->no_suspend_depth--; else if (action->flags & IRQF_COND_SUSPEND) desc->cond_suspend_depth--; } static bool suspend_device_irq(struct irq_desc *desc) { unsigned long chipflags = irq_desc_get_chip(desc)->flags; struct irq_data *irqd = &desc->irq_data; if (!desc->action || irq_desc_is_chained(desc) || desc->no_suspend_depth) return false; if (irqd_is_wakeup_set(irqd)) { irqd_set(irqd, IRQD_WAKEUP_ARMED); if ((chipflags & IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND) && irqd_irq_disabled(irqd)) { /* * Interrupt marked for wakeup is in disabled state. * Enable interrupt here to unmask/enable in irqchip * to be able to resume with such interrupts. */ __enable_irq(desc); irqd_set(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND); } /* * We return true here to force the caller to issue * synchronize_irq(). We need to make sure that the * IRQD_WAKEUP_ARMED is visible before we return from * suspend_device_irqs(). */ return true; } desc->istate |= IRQS_SUSPENDED; __disable_irq(desc); /* * Hardware which has no wakeup source configuration facility * requires that the non wakeup interrupts are masked at the * chip level. The chip implementation indicates that with * IRQCHIP_MASK_ON_SUSPEND. */ if (chipflags & IRQCHIP_MASK_ON_SUSPEND) mask_irq(desc); return true; } /** * suspend_device_irqs - disable all currently enabled interrupt lines * * During system-wide suspend or hibernation device drivers need to be * prevented from receiving interrupts and this function is provided * for this purpose. * * So we disable all interrupts and mark them IRQS_SUSPENDED except * for those which are unused, those which are marked as not * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND * set and those which are marked as active wakeup sources. * * The active wakeup sources are handled by the flow handler entry * code which checks for the IRQD_WAKEUP_ARMED flag, suspends the * interrupt and notifies the pm core about the wakeup. */ void suspend_device_irqs(void) { struct irq_desc *desc; int irq; for_each_irq_desc(irq, desc) { unsigned long flags; bool sync; if (irq_settings_is_nested_thread(desc)) continue; raw_spin_lock_irqsave(&desc->lock, flags); sync = suspend_device_irq(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); if (sync) synchronize_irq(irq); } } static void resume_irq(struct irq_desc *desc) { struct irq_data *irqd = &desc->irq_data; irqd_clear(irqd, IRQD_WAKEUP_ARMED); if (irqd_is_enabled_on_suspend(irqd)) { /* * Interrupt marked for wakeup was enabled during suspend * entry. Disable such interrupts to restore them back to * original state. */ __disable_irq(desc); irqd_clear(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND); } if (desc->istate & IRQS_SUSPENDED) goto resume; /* Force resume the interrupt? */ if (!desc->force_resume_depth) return; /* Pretend that it got disabled ! */ desc->depth++; irq_state_set_disabled(desc); irq_state_set_masked(desc); resume: desc->istate &= ~IRQS_SUSPENDED; __enable_irq(desc); } static void resume_irqs(bool want_early) { struct irq_desc *desc; int irq; for_each_irq_desc(irq, desc) { unsigned long flags; bool is_early = desc->action && desc->action->flags & IRQF_EARLY_RESUME; if (!is_early && want_early) continue; if (irq_settings_is_nested_thread(desc)) continue; raw_spin_lock_irqsave(&desc->lock, flags); resume_irq(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); } } /** * rearm_wake_irq - rearm a wakeup interrupt line after signaling wakeup * @irq: Interrupt to rearm */ void rearm_wake_irq(unsigned int irq) { unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return; if (!(desc->istate & IRQS_SUSPENDED) || !irqd_is_wakeup_set(&desc->irq_data)) goto unlock; desc->istate &= ~IRQS_SUSPENDED; irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); __enable_irq(desc); unlock: irq_put_desc_busunlock(desc, flags); } /** * irq_pm_syscore_resume - enable interrupt lines early * * Enable all interrupt lines with %IRQF_EARLY_RESUME set. */ static void irq_pm_syscore_resume(void) { resume_irqs(true); } static struct syscore_ops irq_pm_syscore_ops = { .resume = irq_pm_syscore_resume, }; static int __init irq_pm_init_ops(void) { register_syscore_ops(&irq_pm_syscore_ops); return 0; } device_initcall(irq_pm_init_ops); /** * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() * * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag * set as well as those with %IRQF_FORCE_RESUME. */ void resume_device_irqs(void) { resume_irqs(false); }
1 1 1 1 1 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 1 1 2 3 2 1 3 3 33 7 9 1 2 1 2 2 2 2 2 2 2 2 2 19 13 14 1 1 1 1 1 2 1 2 1 1 1 3 3 3 3 4 4 4 3 3 2 2 2 2 2 2 1 2 1 45 46 2 45 31 46 46 2 2 2 2 1 2 3 3 3 3 3 2 1 2 13 44 15 38 37 37 58 37 7 4 38 41 39 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 // SPDX-License-Identifier: GPL-2.0-or-later /* * OSS emulation layer for the mixer interface * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/init.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/string.h> #include <linux/module.h> #include <linux/compat.h> #include <sound/core.h> #include <sound/minors.h> #include <sound/control.h> #include <sound/info.h> #include <sound/mixer_oss.h> #include <linux/soundcard.h> #define OSS_ALSAEMULVER _SIOR ('M', 249, int) MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>"); MODULE_DESCRIPTION("Mixer OSS emulation for ALSA."); MODULE_LICENSE("GPL"); MODULE_ALIAS_SNDRV_MINOR(SNDRV_MINOR_OSS_MIXER); static int snd_mixer_oss_open(struct inode *inode, struct file *file) { struct snd_card *card; struct snd_mixer_oss_file *fmixer; int err; err = nonseekable_open(inode, file); if (err < 0) return err; card = snd_lookup_oss_minor_data(iminor(inode), SNDRV_OSS_DEVICE_TYPE_MIXER); if (card == NULL) return -ENODEV; if (card->mixer_oss == NULL) { snd_card_unref(card); return -ENODEV; } err = snd_card_file_add(card, file); if (err < 0) { snd_card_unref(card); return err; } fmixer = kzalloc(sizeof(*fmixer), GFP_KERNEL); if (fmixer == NULL) { snd_card_file_remove(card, file); snd_card_unref(card); return -ENOMEM; } fmixer->card = card; fmixer->mixer = card->mixer_oss; file->private_data = fmixer; if (!try_module_get(card->module)) { kfree(fmixer); snd_card_file_remove(card, file); snd_card_unref(card); return -EFAULT; } snd_card_unref(card); return 0; } static int snd_mixer_oss_release(struct inode *inode, struct file *file) { struct snd_mixer_oss_file *fmixer; if (file->private_data) { fmixer = file->private_data; module_put(fmixer->card->module); snd_card_file_remove(fmixer->card, file); kfree(fmixer); } return 0; } static int snd_mixer_oss_info(struct snd_mixer_oss_file *fmixer, mixer_info __user *_info) { struct snd_card *card = fmixer->card; struct snd_mixer_oss *mixer = fmixer->mixer; struct mixer_info info; memset(&info, 0, sizeof(info)); strscpy(info.id, mixer && mixer->id[0] ? mixer->id : card->driver, sizeof(info.id)); strscpy(info.name, mixer && mixer->name[0] ? mixer->name : card->mixername, sizeof(info.name)); info.modify_counter = card->mixer_oss_change_count; if (copy_to_user(_info, &info, sizeof(info))) return -EFAULT; return 0; } static int snd_mixer_oss_info_obsolete(struct snd_mixer_oss_file *fmixer, _old_mixer_info __user *_info) { struct snd_card *card = fmixer->card; struct snd_mixer_oss *mixer = fmixer->mixer; _old_mixer_info info; memset(&info, 0, sizeof(info)); strscpy(info.id, mixer && mixer->id[0] ? mixer->id : card->driver, sizeof(info.id)); strscpy(info.name, mixer && mixer->name[0] ? mixer->name : card->mixername, sizeof(info.name)); if (copy_to_user(_info, &info, sizeof(info))) return -EFAULT; return 0; } static int snd_mixer_oss_caps(struct snd_mixer_oss_file *fmixer) { struct snd_mixer_oss *mixer = fmixer->mixer; int result = 0; if (mixer == NULL) return -EIO; if (mixer->get_recsrc && mixer->put_recsrc) result |= SOUND_CAP_EXCL_INPUT; return result; } static int snd_mixer_oss_devmask(struct snd_mixer_oss_file *fmixer) { struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_mixer_oss_slot *pslot; int result = 0, chn; if (mixer == NULL) return -EIO; mutex_lock(&mixer->reg_mutex); for (chn = 0; chn < 31; chn++) { pslot = &mixer->slots[chn]; if (pslot->put_volume || pslot->put_recsrc) result |= 1 << chn; } mutex_unlock(&mixer->reg_mutex); return result; } static int snd_mixer_oss_stereodevs(struct snd_mixer_oss_file *fmixer) { struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_mixer_oss_slot *pslot; int result = 0, chn; if (mixer == NULL) return -EIO; mutex_lock(&mixer->reg_mutex); for (chn = 0; chn < 31; chn++) { pslot = &mixer->slots[chn]; if (pslot->put_volume && pslot->stereo) result |= 1 << chn; } mutex_unlock(&mixer->reg_mutex); return result; } static int snd_mixer_oss_recmask(struct snd_mixer_oss_file *fmixer) { struct snd_mixer_oss *mixer = fmixer->mixer; int result = 0; if (mixer == NULL) return -EIO; mutex_lock(&mixer->reg_mutex); if (mixer->put_recsrc && mixer->get_recsrc) { /* exclusive */ result = mixer->mask_recsrc; } else { struct snd_mixer_oss_slot *pslot; int chn; for (chn = 0; chn < 31; chn++) { pslot = &mixer->slots[chn]; if (pslot->put_recsrc) result |= 1 << chn; } } mutex_unlock(&mixer->reg_mutex); return result; } static int snd_mixer_oss_get_recsrc(struct snd_mixer_oss_file *fmixer) { struct snd_mixer_oss *mixer = fmixer->mixer; int result = 0; if (mixer == NULL) return -EIO; mutex_lock(&mixer->reg_mutex); if (mixer->put_recsrc && mixer->get_recsrc) { /* exclusive */ unsigned int index; result = mixer->get_recsrc(fmixer, &index); if (result < 0) goto unlock; result = 1 << index; } else { struct snd_mixer_oss_slot *pslot; int chn; for (chn = 0; chn < 31; chn++) { pslot = &mixer->slots[chn]; if (pslot->get_recsrc) { int active = 0; pslot->get_recsrc(fmixer, pslot, &active); if (active) result |= 1 << chn; } } } mixer->oss_recsrc = result; unlock: mutex_unlock(&mixer->reg_mutex); return result; } static int snd_mixer_oss_set_recsrc(struct snd_mixer_oss_file *fmixer, int recsrc) { struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_mixer_oss_slot *pslot; int chn, active; unsigned int index; int result = 0; if (mixer == NULL) return -EIO; mutex_lock(&mixer->reg_mutex); if (mixer->get_recsrc && mixer->put_recsrc) { /* exclusive input */ if (recsrc & ~mixer->oss_recsrc) recsrc &= ~mixer->oss_recsrc; mixer->put_recsrc(fmixer, ffz(~recsrc)); mixer->get_recsrc(fmixer, &index); result = 1 << index; } for (chn = 0; chn < 31; chn++) { pslot = &mixer->slots[chn]; if (pslot->put_recsrc) { active = (recsrc & (1 << chn)) ? 1 : 0; pslot->put_recsrc(fmixer, pslot, active); } } if (! result) { for (chn = 0; chn < 31; chn++) { pslot = &mixer->slots[chn]; if (pslot->get_recsrc) { active = 0; pslot->get_recsrc(fmixer, pslot, &active); if (active) result |= 1 << chn; } } } mutex_unlock(&mixer->reg_mutex); return result; } static int snd_mixer_oss_get_volume(struct snd_mixer_oss_file *fmixer, int slot) { struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_mixer_oss_slot *pslot; int result = 0, left, right; if (mixer == NULL || slot > 30) return -EIO; mutex_lock(&mixer->reg_mutex); pslot = &mixer->slots[slot]; left = pslot->volume[0]; right = pslot->volume[1]; if (pslot->get_volume) result = pslot->get_volume(fmixer, pslot, &left, &right); if (!pslot->stereo) right = left; if (snd_BUG_ON(left < 0 || left > 100)) { result = -EIO; goto unlock; } if (snd_BUG_ON(right < 0 || right > 100)) { result = -EIO; goto unlock; } if (result >= 0) { pslot->volume[0] = left; pslot->volume[1] = right; result = (left & 0xff) | ((right & 0xff) << 8); } unlock: mutex_unlock(&mixer->reg_mutex); return result; } static int snd_mixer_oss_set_volume(struct snd_mixer_oss_file *fmixer, int slot, int volume) { struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_mixer_oss_slot *pslot; int result = 0, left = volume & 0xff, right = (volume >> 8) & 0xff; if (mixer == NULL || slot > 30) return -EIO; mutex_lock(&mixer->reg_mutex); pslot = &mixer->slots[slot]; if (left > 100) left = 100; if (right > 100) right = 100; if (!pslot->stereo) right = left; if (pslot->put_volume) result = pslot->put_volume(fmixer, pslot, left, right); if (result < 0) goto unlock; pslot->volume[0] = left; pslot->volume[1] = right; result = (left & 0xff) | ((right & 0xff) << 8); unlock: mutex_unlock(&mixer->reg_mutex); return result; } static int snd_mixer_oss_ioctl1(struct snd_mixer_oss_file *fmixer, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; int __user *p = argp; int tmp; if (snd_BUG_ON(!fmixer)) return -ENXIO; if (((cmd >> 8) & 0xff) == 'M') { switch (cmd) { case SOUND_MIXER_INFO: return snd_mixer_oss_info(fmixer, argp); case SOUND_OLD_MIXER_INFO: return snd_mixer_oss_info_obsolete(fmixer, argp); case SOUND_MIXER_WRITE_RECSRC: if (get_user(tmp, p)) return -EFAULT; tmp = snd_mixer_oss_set_recsrc(fmixer, tmp); if (tmp < 0) return tmp; return put_user(tmp, p); case OSS_GETVERSION: return put_user(SNDRV_OSS_VERSION, p); case OSS_ALSAEMULVER: return put_user(1, p); case SOUND_MIXER_READ_DEVMASK: tmp = snd_mixer_oss_devmask(fmixer); if (tmp < 0) return tmp; return put_user(tmp, p); case SOUND_MIXER_READ_STEREODEVS: tmp = snd_mixer_oss_stereodevs(fmixer); if (tmp < 0) return tmp; return put_user(tmp, p); case SOUND_MIXER_READ_RECMASK: tmp = snd_mixer_oss_recmask(fmixer); if (tmp < 0) return tmp; return put_user(tmp, p); case SOUND_MIXER_READ_CAPS: tmp = snd_mixer_oss_caps(fmixer); if (tmp < 0) return tmp; return put_user(tmp, p); case SOUND_MIXER_READ_RECSRC: tmp = snd_mixer_oss_get_recsrc(fmixer); if (tmp < 0) return tmp; return put_user(tmp, p); } } if (cmd & SIOC_IN) { if (get_user(tmp, p)) return -EFAULT; tmp = snd_mixer_oss_set_volume(fmixer, cmd & 0xff, tmp); if (tmp < 0) return tmp; return put_user(tmp, p); } else if (cmd & SIOC_OUT) { tmp = snd_mixer_oss_get_volume(fmixer, cmd & 0xff); if (tmp < 0) return tmp; return put_user(tmp, p); } return -ENXIO; } static long snd_mixer_oss_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return snd_mixer_oss_ioctl1(file->private_data, cmd, arg); } int snd_mixer_oss_ioctl_card(struct snd_card *card, unsigned int cmd, unsigned long arg) { struct snd_mixer_oss_file fmixer; if (snd_BUG_ON(!card)) return -ENXIO; if (card->mixer_oss == NULL) return -ENXIO; memset(&fmixer, 0, sizeof(fmixer)); fmixer.card = card; fmixer.mixer = card->mixer_oss; return snd_mixer_oss_ioctl1(&fmixer, cmd, arg); } EXPORT_SYMBOL(snd_mixer_oss_ioctl_card); #ifdef CONFIG_COMPAT /* all compatible */ static long snd_mixer_oss_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) { return snd_mixer_oss_ioctl1(file->private_data, cmd, (unsigned long)compat_ptr(arg)); } #else #define snd_mixer_oss_ioctl_compat NULL #endif /* * REGISTRATION PART */ static const struct file_operations snd_mixer_oss_f_ops = { .owner = THIS_MODULE, .open = snd_mixer_oss_open, .release = snd_mixer_oss_release, .llseek = no_llseek, .unlocked_ioctl = snd_mixer_oss_ioctl, .compat_ioctl = snd_mixer_oss_ioctl_compat, }; /* * utilities */ static long snd_mixer_oss_conv(long val, long omin, long omax, long nmin, long nmax) { long orange = omax - omin, nrange = nmax - nmin; if (orange == 0) return 0; return DIV_ROUND_CLOSEST(nrange * (val - omin), orange) + nmin; } /* convert from alsa native to oss values (0-100) */ static long snd_mixer_oss_conv1(long val, long min, long max, int *old) { if (val == snd_mixer_oss_conv(*old, 0, 100, min, max)) return *old; return snd_mixer_oss_conv(val, min, max, 0, 100); } /* convert from oss to alsa native values */ static long snd_mixer_oss_conv2(long val, long min, long max) { return snd_mixer_oss_conv(val, 0, 100, min, max); } #if 0 static void snd_mixer_oss_recsrce_set(struct snd_card *card, int slot) { struct snd_mixer_oss *mixer = card->mixer_oss; if (mixer) mixer->mask_recsrc |= 1 << slot; } static int snd_mixer_oss_recsrce_get(struct snd_card *card, int slot) { struct snd_mixer_oss *mixer = card->mixer_oss; if (mixer && (mixer->mask_recsrc & (1 << slot))) return 1; return 0; } #endif #define SNDRV_MIXER_OSS_SIGNATURE 0x65999250 #define SNDRV_MIXER_OSS_ITEM_GLOBAL 0 #define SNDRV_MIXER_OSS_ITEM_GSWITCH 1 #define SNDRV_MIXER_OSS_ITEM_GROUTE 2 #define SNDRV_MIXER_OSS_ITEM_GVOLUME 3 #define SNDRV_MIXER_OSS_ITEM_PSWITCH 4 #define SNDRV_MIXER_OSS_ITEM_PROUTE 5 #define SNDRV_MIXER_OSS_ITEM_PVOLUME 6 #define SNDRV_MIXER_OSS_ITEM_CSWITCH 7 #define SNDRV_MIXER_OSS_ITEM_CROUTE 8 #define SNDRV_MIXER_OSS_ITEM_CVOLUME 9 #define SNDRV_MIXER_OSS_ITEM_CAPTURE 10 #define SNDRV_MIXER_OSS_ITEM_COUNT 11 #define SNDRV_MIXER_OSS_PRESENT_GLOBAL (1<<0) #define SNDRV_MIXER_OSS_PRESENT_GSWITCH (1<<1) #define SNDRV_MIXER_OSS_PRESENT_GROUTE (1<<2) #define SNDRV_MIXER_OSS_PRESENT_GVOLUME (1<<3) #define SNDRV_MIXER_OSS_PRESENT_PSWITCH (1<<4) #define SNDRV_MIXER_OSS_PRESENT_PROUTE (1<<5) #define SNDRV_MIXER_OSS_PRESENT_PVOLUME (1<<6) #define SNDRV_MIXER_OSS_PRESENT_CSWITCH (1<<7) #define SNDRV_MIXER_OSS_PRESENT_CROUTE (1<<8) #define SNDRV_MIXER_OSS_PRESENT_CVOLUME (1<<9) #define SNDRV_MIXER_OSS_PRESENT_CAPTURE (1<<10) struct slot { unsigned int signature; unsigned int present; unsigned int channels; unsigned int numid[SNDRV_MIXER_OSS_ITEM_COUNT]; unsigned int capture_item; const struct snd_mixer_oss_assign_table *assigned; unsigned int allocated: 1; }; #define ID_UNKNOWN ((unsigned int)-1) static struct snd_kcontrol *snd_mixer_oss_test_id(struct snd_mixer_oss *mixer, const char *name, int index) { struct snd_card *card = mixer->card; struct snd_ctl_elem_id id; memset(&id, 0, sizeof(id)); id.iface = SNDRV_CTL_ELEM_IFACE_MIXER; strscpy(id.name, name, sizeof(id.name)); id.index = index; return snd_ctl_find_id_locked(card, &id); } static void snd_mixer_oss_get_volume1_vol(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, unsigned int numid, int *left, int *right) { struct snd_ctl_elem_info *uinfo; struct snd_ctl_elem_value *uctl; struct snd_kcontrol *kctl; struct snd_card *card = fmixer->card; if (numid == ID_UNKNOWN) return; down_read(&card->controls_rwsem); kctl = snd_ctl_find_numid_locked(card, numid); if (!kctl) { up_read(&card->controls_rwsem); return; } uinfo = kzalloc(sizeof(*uinfo), GFP_KERNEL); uctl = kzalloc(sizeof(*uctl), GFP_KERNEL); if (uinfo == NULL || uctl == NULL) goto __unalloc; if (kctl->info(kctl, uinfo)) goto __unalloc; if (kctl->get(kctl, uctl)) goto __unalloc; if (uinfo->type == SNDRV_CTL_ELEM_TYPE_BOOLEAN && uinfo->value.integer.min == 0 && uinfo->value.integer.max == 1) goto __unalloc; *left = snd_mixer_oss_conv1(uctl->value.integer.value[0], uinfo->value.integer.min, uinfo->value.integer.max, &pslot->volume[0]); if (uinfo->count > 1) *right = snd_mixer_oss_conv1(uctl->value.integer.value[1], uinfo->value.integer.min, uinfo->value.integer.max, &pslot->volume[1]); __unalloc: up_read(&card->controls_rwsem); kfree(uctl); kfree(uinfo); } static void snd_mixer_oss_get_volume1_sw(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, unsigned int numid, int *left, int *right, int route) { struct snd_ctl_elem_info *uinfo; struct snd_ctl_elem_value *uctl; struct snd_kcontrol *kctl; struct snd_card *card = fmixer->card; if (numid == ID_UNKNOWN) return; down_read(&card->controls_rwsem); kctl = snd_ctl_find_numid_locked(card, numid); if (!kctl) { up_read(&card->controls_rwsem); return; } uinfo = kzalloc(sizeof(*uinfo), GFP_KERNEL); uctl = kzalloc(sizeof(*uctl), GFP_KERNEL); if (uinfo == NULL || uctl == NULL) goto __unalloc; if (kctl->info(kctl, uinfo)) goto __unalloc; if (kctl->get(kctl, uctl)) goto __unalloc; if (!uctl->value.integer.value[0]) { *left = 0; if (uinfo->count == 1) *right = 0; } if (uinfo->count > 1 && !uctl->value.integer.value[route ? 3 : 1]) *right = 0; __unalloc: up_read(&card->controls_rwsem); kfree(uctl); kfree(uinfo); } static int snd_mixer_oss_get_volume1(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, int *left, int *right) { struct slot *slot = pslot->private_data; *left = *right = 100; if (slot->present & SNDRV_MIXER_OSS_PRESENT_PVOLUME) { snd_mixer_oss_get_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PVOLUME], left, right); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GVOLUME) { snd_mixer_oss_get_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GVOLUME], left, right); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GLOBAL) { snd_mixer_oss_get_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GLOBAL], left, right); } if (slot->present & SNDRV_MIXER_OSS_PRESENT_PSWITCH) { snd_mixer_oss_get_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PSWITCH], left, right, 0); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GSWITCH) { snd_mixer_oss_get_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GSWITCH], left, right, 0); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_PROUTE) { snd_mixer_oss_get_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PROUTE], left, right, 1); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GROUTE) { snd_mixer_oss_get_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GROUTE], left, right, 1); } return 0; } static void snd_mixer_oss_put_volume1_vol(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, unsigned int numid, int left, int right) { struct snd_ctl_elem_info *uinfo; struct snd_ctl_elem_value *uctl; struct snd_kcontrol *kctl; struct snd_card *card = fmixer->card; int res; if (numid == ID_UNKNOWN) return; down_read(&card->controls_rwsem); kctl = snd_ctl_find_numid_locked(card, numid); if (!kctl) { up_read(&card->controls_rwsem); return; } uinfo = kzalloc(sizeof(*uinfo), GFP_KERNEL); uctl = kzalloc(sizeof(*uctl), GFP_KERNEL); if (uinfo == NULL || uctl == NULL) goto __unalloc; if (kctl->info(kctl, uinfo)) goto __unalloc; if (uinfo->type == SNDRV_CTL_ELEM_TYPE_BOOLEAN && uinfo->value.integer.min == 0 && uinfo->value.integer.max == 1) goto __unalloc; uctl->value.integer.value[0] = snd_mixer_oss_conv2(left, uinfo->value.integer.min, uinfo->value.integer.max); if (uinfo->count > 1) uctl->value.integer.value[1] = snd_mixer_oss_conv2(right, uinfo->value.integer.min, uinfo->value.integer.max); res = kctl->put(kctl, uctl); if (res < 0) goto __unalloc; if (res > 0) snd_ctl_notify(card, SNDRV_CTL_EVENT_MASK_VALUE, &kctl->id); __unalloc: up_read(&card->controls_rwsem); kfree(uctl); kfree(uinfo); } static void snd_mixer_oss_put_volume1_sw(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, unsigned int numid, int left, int right, int route) { struct snd_ctl_elem_info *uinfo; struct snd_ctl_elem_value *uctl; struct snd_kcontrol *kctl; struct snd_card *card = fmixer->card; int res; if (numid == ID_UNKNOWN) return; down_read(&card->controls_rwsem); kctl = snd_ctl_find_numid_locked(card, numid); if (!kctl) { up_read(&card->controls_rwsem); return; } uinfo = kzalloc(sizeof(*uinfo), GFP_KERNEL); uctl = kzalloc(sizeof(*uctl), GFP_KERNEL); if (uinfo == NULL || uctl == NULL) goto __unalloc; if (kctl->info(kctl, uinfo)) goto __unalloc; if (uinfo->count > 1) { uctl->value.integer.value[0] = left > 0 ? 1 : 0; uctl->value.integer.value[route ? 3 : 1] = right > 0 ? 1 : 0; if (route) { uctl->value.integer.value[1] = uctl->value.integer.value[2] = 0; } } else { uctl->value.integer.value[0] = (left > 0 || right > 0) ? 1 : 0; } res = kctl->put(kctl, uctl); if (res < 0) goto __unalloc; if (res > 0) snd_ctl_notify(card, SNDRV_CTL_EVENT_MASK_VALUE, &kctl->id); __unalloc: up_read(&card->controls_rwsem); kfree(uctl); kfree(uinfo); } static int snd_mixer_oss_put_volume1(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, int left, int right) { struct slot *slot = pslot->private_data; if (slot->present & SNDRV_MIXER_OSS_PRESENT_PVOLUME) { snd_mixer_oss_put_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PVOLUME], left, right); if (slot->present & SNDRV_MIXER_OSS_PRESENT_CVOLUME) snd_mixer_oss_put_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CVOLUME], left, right); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_CVOLUME) { snd_mixer_oss_put_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CVOLUME], left, right); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GVOLUME) { snd_mixer_oss_put_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GVOLUME], left, right); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GLOBAL) { snd_mixer_oss_put_volume1_vol(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GLOBAL], left, right); } if (left || right) { if (slot->present & SNDRV_MIXER_OSS_PRESENT_PSWITCH) snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PSWITCH], left, right, 0); if (slot->present & SNDRV_MIXER_OSS_PRESENT_CSWITCH) snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CSWITCH], left, right, 0); if (slot->present & SNDRV_MIXER_OSS_PRESENT_GSWITCH) snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GSWITCH], left, right, 0); if (slot->present & SNDRV_MIXER_OSS_PRESENT_PROUTE) snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PROUTE], left, right, 1); if (slot->present & SNDRV_MIXER_OSS_PRESENT_CROUTE) snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CROUTE], left, right, 1); if (slot->present & SNDRV_MIXER_OSS_PRESENT_GROUTE) snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GROUTE], left, right, 1); } else { if (slot->present & SNDRV_MIXER_OSS_PRESENT_PSWITCH) { snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PSWITCH], left, right, 0); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_CSWITCH) { snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CSWITCH], left, right, 0); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GSWITCH) { snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GSWITCH], left, right, 0); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_PROUTE) { snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_PROUTE], left, right, 1); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_CROUTE) { snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CROUTE], left, right, 1); } else if (slot->present & SNDRV_MIXER_OSS_PRESENT_GROUTE) { snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_GROUTE], left, right, 1); } } return 0; } static int snd_mixer_oss_get_recsrc1_sw(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, int *active) { struct slot *slot = pslot->private_data; int left, right; left = right = 1; snd_mixer_oss_get_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CSWITCH], &left, &right, 0); *active = (left || right) ? 1 : 0; return 0; } static int snd_mixer_oss_get_recsrc1_route(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, int *active) { struct slot *slot = pslot->private_data; int left, right; left = right = 1; snd_mixer_oss_get_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CROUTE], &left, &right, 1); *active = (left || right) ? 1 : 0; return 0; } static int snd_mixer_oss_put_recsrc1_sw(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, int active) { struct slot *slot = pslot->private_data; snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CSWITCH], active, active, 0); return 0; } static int snd_mixer_oss_put_recsrc1_route(struct snd_mixer_oss_file *fmixer, struct snd_mixer_oss_slot *pslot, int active) { struct slot *slot = pslot->private_data; snd_mixer_oss_put_volume1_sw(fmixer, pslot, slot->numid[SNDRV_MIXER_OSS_ITEM_CROUTE], active, active, 1); return 0; } static int snd_mixer_oss_get_recsrc2(struct snd_mixer_oss_file *fmixer, unsigned int *active_index) { struct snd_card *card = fmixer->card; struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_kcontrol *kctl; struct snd_mixer_oss_slot *pslot; struct slot *slot; struct snd_ctl_elem_info *uinfo; struct snd_ctl_elem_value *uctl; int err, idx; uinfo = kzalloc(sizeof(*uinfo), GFP_KERNEL); uctl = kzalloc(sizeof(*uctl), GFP_KERNEL); if (uinfo == NULL || uctl == NULL) { err = -ENOMEM; goto __free_only; } down_read(&card->controls_rwsem); kctl = snd_mixer_oss_test_id(mixer, "Capture Source", 0); if (! kctl) { err = -ENOENT; goto __unlock; } err = kctl->info(kctl, uinfo); if (err < 0) goto __unlock; err = kctl->get(kctl, uctl); if (err < 0) goto __unlock; for (idx = 0; idx < 32; idx++) { if (!(mixer->mask_recsrc & (1 << idx))) continue; pslot = &mixer->slots[idx]; slot = pslot->private_data; if (slot->signature != SNDRV_MIXER_OSS_SIGNATURE) continue; if (!(slot->present & SNDRV_MIXER_OSS_PRESENT_CAPTURE)) continue; if (slot->capture_item == uctl->value.enumerated.item[0]) { *active_index = idx; break; } } err = 0; __unlock: up_read(&card->controls_rwsem); __free_only: kfree(uctl); kfree(uinfo); return err; } static int snd_mixer_oss_put_recsrc2(struct snd_mixer_oss_file *fmixer, unsigned int active_index) { struct snd_card *card = fmixer->card; struct snd_mixer_oss *mixer = fmixer->mixer; struct snd_kcontrol *kctl; struct snd_mixer_oss_slot *pslot; struct slot *slot = NULL; struct snd_ctl_elem_info *uinfo; struct snd_ctl_elem_value *uctl; int err; unsigned int idx; uinfo = kzalloc(sizeof(*uinfo), GFP_KERNEL); uctl = kzalloc(sizeof(*uctl), GFP_KERNEL); if (uinfo == NULL || uctl == NULL) { err = -ENOMEM; goto __free_only; } down_read(&card->controls_rwsem); kctl = snd_mixer_oss_test_id(mixer, "Capture Source", 0); if (! kctl) { err = -ENOENT; goto __unlock; } err = kctl->info(kctl, uinfo); if (err < 0) goto __unlock; for (idx = 0; idx < 32; idx++) { if (!(mixer->mask_recsrc & (1 << idx))) continue; pslot = &mixer->slots[idx]; slot = pslot->private_data; if (slot->signature != SNDRV_MIXER_OSS_SIGNATURE) continue; if (!(slot->present & SNDRV_MIXER_OSS_PRESENT_CAPTURE)) continue; if (idx == active_index) break; slot = NULL; } if (! slot) goto __unlock; for (idx = 0; idx < uinfo->count; idx++) uctl->value.enumerated.item[idx] = slot->capture_item; err = kctl->put(kctl, uctl); if (err > 0) snd_ctl_notify(fmixer->card, SNDRV_CTL_EVENT_MASK_VALUE, &kctl->id); err = 0; __unlock: up_read(&card->controls_rwsem); __free_only: kfree(uctl); kfree(uinfo); return err; } struct snd_mixer_oss_assign_table { int oss_id; const char *name; int index; }; static int snd_mixer_oss_build_test(struct snd_mixer_oss *mixer, struct slot *slot, const char *name, int index, int item) { struct snd_ctl_elem_info *info; struct snd_kcontrol *kcontrol; struct snd_card *card = mixer->card; int err; down_read(&card->controls_rwsem); kcontrol = snd_mixer_oss_test_id(mixer, name, index); if (kcontrol == NULL) { up_read(&card->controls_rwsem); return 0; } info = kmalloc(sizeof(*info), GFP_KERNEL); if (! info) { up_read(&card->controls_rwsem); return -ENOMEM; } err = kcontrol->info(kcontrol, info); if (err < 0) { up_read(&card->controls_rwsem); kfree(info); return err; } slot->numid[item] = kcontrol->id.numid; up_read(&card->controls_rwsem); if (info->count > slot->channels) slot->channels = info->count; slot->present |= 1 << item; kfree(info); return 0; } static void snd_mixer_oss_slot_free(struct snd_mixer_oss_slot *chn) { struct slot *p = chn->private_data; if (p) { if (p->allocated && p->assigned) { kfree_const(p->assigned->name); kfree_const(p->assigned); } kfree(p); } } static void mixer_slot_clear(struct snd_mixer_oss_slot *rslot) { int idx = rslot->number; /* remember this */ if (rslot->private_free) rslot->private_free(rslot); memset(rslot, 0, sizeof(*rslot)); rslot->number = idx; } /* In a separate function to keep gcc 3.2 happy - do NOT merge this in snd_mixer_oss_build_input! */ static int snd_mixer_oss_build_test_all(struct snd_mixer_oss *mixer, const struct snd_mixer_oss_assign_table *ptr, struct slot *slot) { char str[64]; int err; err = snd_mixer_oss_build_test(mixer, slot, ptr->name, ptr->index, SNDRV_MIXER_OSS_ITEM_GLOBAL); if (err) return err; sprintf(str, "%s Switch", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_GSWITCH); if (err) return err; sprintf(str, "%s Route", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_GROUTE); if (err) return err; sprintf(str, "%s Volume", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_GVOLUME); if (err) return err; sprintf(str, "%s Playback Switch", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_PSWITCH); if (err) return err; sprintf(str, "%s Playback Route", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_PROUTE); if (err) return err; sprintf(str, "%s Playback Volume", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_PVOLUME); if (err) return err; sprintf(str, "%s Capture Switch", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_CSWITCH); if (err) return err; sprintf(str, "%s Capture Route", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_CROUTE); if (err) return err; sprintf(str, "%s Capture Volume", ptr->name); err = snd_mixer_oss_build_test(mixer, slot, str, ptr->index, SNDRV_MIXER_OSS_ITEM_CVOLUME); if (err) return err; return 0; } /* * build an OSS mixer element. * ptr_allocated means the entry is dynamically allocated (change via proc file). * when replace_old = 1, the old entry is replaced with the new one. */ static int snd_mixer_oss_build_input(struct snd_mixer_oss *mixer, const struct snd_mixer_oss_assign_table *ptr, int ptr_allocated, int replace_old) { struct slot slot; struct slot *pslot; struct snd_kcontrol *kctl; struct snd_mixer_oss_slot *rslot; char str[64]; /* check if already assigned */ if (mixer->slots[ptr->oss_id].get_volume && ! replace_old) return 0; memset(&slot, 0, sizeof(slot)); memset(slot.numid, 0xff, sizeof(slot.numid)); /* ID_UNKNOWN */ if (snd_mixer_oss_build_test_all(mixer, ptr, &slot)) return 0; down_read(&mixer->card->controls_rwsem); kctl = NULL; if (!ptr->index) kctl = snd_mixer_oss_test_id(mixer, "Capture Source", 0); if (kctl) { struct snd_ctl_elem_info *uinfo; uinfo = kzalloc(sizeof(*uinfo), GFP_KERNEL); if (! uinfo) { up_read(&mixer->card->controls_rwsem); return -ENOMEM; } if (kctl->info(kctl, uinfo)) { up_read(&mixer->card->controls_rwsem); kfree(uinfo); return 0; } strcpy(str, ptr->name); if (!strcmp(str, "Master")) strcpy(str, "Mix"); if (!strcmp(str, "Master Mono")) strcpy(str, "Mix Mono"); slot.capture_item = 0; if (!strcmp(uinfo->value.enumerated.name, str)) { slot.present |= SNDRV_MIXER_OSS_PRESENT_CAPTURE; } else { for (slot.capture_item = 1; slot.capture_item < uinfo->value.enumerated.items; slot.capture_item++) { uinfo->value.enumerated.item = slot.capture_item; if (kctl->info(kctl, uinfo)) { up_read(&mixer->card->controls_rwsem); kfree(uinfo); return 0; } if (!strcmp(uinfo->value.enumerated.name, str)) { slot.present |= SNDRV_MIXER_OSS_PRESENT_CAPTURE; break; } } } kfree(uinfo); } up_read(&mixer->card->controls_rwsem); if (slot.present != 0) { pslot = kmalloc(sizeof(slot), GFP_KERNEL); if (! pslot) return -ENOMEM; *pslot = slot; pslot->signature = SNDRV_MIXER_OSS_SIGNATURE; pslot->assigned = ptr; pslot->allocated = ptr_allocated; rslot = &mixer->slots[ptr->oss_id]; mixer_slot_clear(rslot); rslot->stereo = slot.channels > 1 ? 1 : 0; rslot->get_volume = snd_mixer_oss_get_volume1; rslot->put_volume = snd_mixer_oss_put_volume1; /* note: ES18xx have both Capture Source and XX Capture Volume !!! */ if (slot.present & SNDRV_MIXER_OSS_PRESENT_CSWITCH) { rslot->get_recsrc = snd_mixer_oss_get_recsrc1_sw; rslot->put_recsrc = snd_mixer_oss_put_recsrc1_sw; } else if (slot.present & SNDRV_MIXER_OSS_PRESENT_CROUTE) { rslot->get_recsrc = snd_mixer_oss_get_recsrc1_route; rslot->put_recsrc = snd_mixer_oss_put_recsrc1_route; } else if (slot.present & SNDRV_MIXER_OSS_PRESENT_CAPTURE) { mixer->mask_recsrc |= 1 << ptr->oss_id; } rslot->private_data = pslot; rslot->private_free = snd_mixer_oss_slot_free; return 1; } return 0; } #ifdef CONFIG_SND_PROC_FS /* */ #define MIXER_VOL(name) [SOUND_MIXER_##name] = #name static const char * const oss_mixer_names[SNDRV_OSS_MAX_MIXERS] = { MIXER_VOL(VOLUME), MIXER_VOL(BASS), MIXER_VOL(TREBLE), MIXER_VOL(SYNTH), MIXER_VOL(PCM), MIXER_VOL(SPEAKER), MIXER_VOL(LINE), MIXER_VOL(MIC), MIXER_VOL(CD), MIXER_VOL(IMIX), MIXER_VOL(ALTPCM), MIXER_VOL(RECLEV), MIXER_VOL(IGAIN), MIXER_VOL(OGAIN), MIXER_VOL(LINE1), MIXER_VOL(LINE2), MIXER_VOL(LINE3), MIXER_VOL(DIGITAL1), MIXER_VOL(DIGITAL2), MIXER_VOL(DIGITAL3), MIXER_VOL(PHONEIN), MIXER_VOL(PHONEOUT), MIXER_VOL(VIDEO), MIXER_VOL(RADIO), MIXER_VOL(MONITOR), }; /* * /proc interface */ static void snd_mixer_oss_proc_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct snd_mixer_oss *mixer = entry->private_data; int i; mutex_lock(&mixer->reg_mutex); for (i = 0; i < SNDRV_OSS_MAX_MIXERS; i++) { struct slot *p; if (! oss_mixer_names[i]) continue; p = (struct slot *)mixer->slots[i].private_data; snd_iprintf(buffer, "%s ", oss_mixer_names[i]); if (p && p->assigned) snd_iprintf(buffer, "\"%s\" %d\n", p->assigned->name, p->assigned->index); else snd_iprintf(buffer, "\"\" 0\n"); } mutex_unlock(&mixer->reg_mutex); } static void snd_mixer_oss_proc_write(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct snd_mixer_oss *mixer = entry->private_data; char line[128], str[32], idxstr[16]; const char *cptr; unsigned int idx; int ch; struct snd_mixer_oss_assign_table *tbl; struct slot *slot; while (!snd_info_get_line(buffer, line, sizeof(line))) { cptr = snd_info_get_str(str, line, sizeof(str)); for (ch = 0; ch < SNDRV_OSS_MAX_MIXERS; ch++) if (oss_mixer_names[ch] && strcmp(oss_mixer_names[ch], str) == 0) break; if (ch >= SNDRV_OSS_MAX_MIXERS) { pr_err("ALSA: mixer_oss: invalid OSS volume '%s'\n", str); continue; } cptr = snd_info_get_str(str, cptr, sizeof(str)); if (! *str) { /* remove the entry */ mutex_lock(&mixer->reg_mutex); mixer_slot_clear(&mixer->slots[ch]); mutex_unlock(&mixer->reg_mutex); continue; } snd_info_get_str(idxstr, cptr, sizeof(idxstr)); idx = simple_strtoul(idxstr, NULL, 10); if (idx >= 0x4000) { /* too big */ pr_err("ALSA: mixer_oss: invalid index %d\n", idx); continue; } mutex_lock(&mixer->reg_mutex); slot = (struct slot *)mixer->slots[ch].private_data; if (slot && slot->assigned && slot->assigned->index == idx && ! strcmp(slot->assigned->name, str)) /* not changed */ goto __unlock; tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); if (!tbl) goto __unlock; tbl->oss_id = ch; tbl->name = kstrdup(str, GFP_KERNEL); if (! tbl->name) { kfree(tbl); goto __unlock; } tbl->index = idx; if (snd_mixer_oss_build_input(mixer, tbl, 1, 1) <= 0) { kfree(tbl->name); kfree(tbl); } __unlock: mutex_unlock(&mixer->reg_mutex); } } static void snd_mixer_oss_proc_init(struct snd_mixer_oss *mixer) { struct snd_info_entry *entry; entry = snd_info_create_card_entry(mixer->card, "oss_mixer", mixer->card->proc_root); if (! entry) return; entry->content = SNDRV_INFO_CONTENT_TEXT; entry->mode = S_IFREG | 0644; entry->c.text.read = snd_mixer_oss_proc_read; entry->c.text.write = snd_mixer_oss_proc_write; entry->private_data = mixer; if (snd_info_register(entry) < 0) { snd_info_free_entry(entry); entry = NULL; } mixer->proc_entry = entry; } static void snd_mixer_oss_proc_done(struct snd_mixer_oss *mixer) { snd_info_free_entry(mixer->proc_entry); mixer->proc_entry = NULL; } #else /* !CONFIG_SND_PROC_FS */ #define snd_mixer_oss_proc_init(mix) #define snd_mixer_oss_proc_done(mix) #endif /* CONFIG_SND_PROC_FS */ static void snd_mixer_oss_build(struct snd_mixer_oss *mixer) { static const struct snd_mixer_oss_assign_table table[] = { { SOUND_MIXER_VOLUME, "Master", 0 }, { SOUND_MIXER_VOLUME, "Front", 0 }, /* fallback */ { SOUND_MIXER_BASS, "Tone Control - Bass", 0 }, { SOUND_MIXER_TREBLE, "Tone Control - Treble", 0 }, { SOUND_MIXER_SYNTH, "Synth", 0 }, { SOUND_MIXER_SYNTH, "FM", 0 }, /* fallback */ { SOUND_MIXER_SYNTH, "Music", 0 }, /* fallback */ { SOUND_MIXER_PCM, "PCM", 0 }, { SOUND_MIXER_SPEAKER, "Beep", 0 }, { SOUND_MIXER_SPEAKER, "PC Speaker", 0 }, /* fallback */ { SOUND_MIXER_SPEAKER, "Speaker", 0 }, /* fallback */ { SOUND_MIXER_LINE, "Line", 0 }, { SOUND_MIXER_MIC, "Mic", 0 }, { SOUND_MIXER_CD, "CD", 0 }, { SOUND_MIXER_IMIX, "Monitor Mix", 0 }, { SOUND_MIXER_ALTPCM, "PCM", 1 }, { SOUND_MIXER_ALTPCM, "Headphone", 0 }, /* fallback */ { SOUND_MIXER_ALTPCM, "Wave", 0 }, /* fallback */ { SOUND_MIXER_RECLEV, "-- nothing --", 0 }, { SOUND_MIXER_IGAIN, "Capture", 0 }, { SOUND_MIXER_OGAIN, "Playback", 0 }, { SOUND_MIXER_LINE1, "Aux", 0 }, { SOUND_MIXER_LINE2, "Aux", 1 }, { SOUND_MIXER_LINE3, "Aux", 2 }, { SOUND_MIXER_DIGITAL1, "Digital", 0 }, { SOUND_MIXER_DIGITAL1, "IEC958", 0 }, /* fallback */ { SOUND_MIXER_DIGITAL1, "IEC958 Optical", 0 }, /* fallback */ { SOUND_MIXER_DIGITAL1, "IEC958 Coaxial", 0 }, /* fallback */ { SOUND_MIXER_DIGITAL2, "Digital", 1 }, { SOUND_MIXER_DIGITAL3, "Digital", 2 }, { SOUND_MIXER_PHONEIN, "Phone", 0 }, { SOUND_MIXER_PHONEOUT, "Master Mono", 0 }, { SOUND_MIXER_PHONEOUT, "Speaker", 0 }, /*fallback*/ { SOUND_MIXER_PHONEOUT, "Mono", 0 }, /*fallback*/ { SOUND_MIXER_PHONEOUT, "Phone", 0 }, /* fallback */ { SOUND_MIXER_VIDEO, "Video", 0 }, { SOUND_MIXER_RADIO, "Radio", 0 }, { SOUND_MIXER_MONITOR, "Monitor", 0 } }; unsigned int idx; for (idx = 0; idx < ARRAY_SIZE(table); idx++) snd_mixer_oss_build_input(mixer, &table[idx], 0, 0); if (mixer->mask_recsrc) { mixer->get_recsrc = snd_mixer_oss_get_recsrc2; mixer->put_recsrc = snd_mixer_oss_put_recsrc2; } } /* * */ static int snd_mixer_oss_free1(void *private) { struct snd_mixer_oss *mixer = private; struct snd_card *card; int idx; if (!mixer) return 0; card = mixer->card; if (snd_BUG_ON(mixer != card->mixer_oss)) return -ENXIO; card->mixer_oss = NULL; for (idx = 0; idx < SNDRV_OSS_MAX_MIXERS; idx++) { struct snd_mixer_oss_slot *chn = &mixer->slots[idx]; if (chn->private_free) chn->private_free(chn); } kfree(mixer); return 0; } static int snd_mixer_oss_notify_handler(struct snd_card *card, int cmd) { struct snd_mixer_oss *mixer; if (cmd == SND_MIXER_OSS_NOTIFY_REGISTER) { int idx, err; mixer = kcalloc(2, sizeof(*mixer), GFP_KERNEL); if (mixer == NULL) return -ENOMEM; mutex_init(&mixer->reg_mutex); err = snd_register_oss_device(SNDRV_OSS_DEVICE_TYPE_MIXER, card, 0, &snd_mixer_oss_f_ops, card); if (err < 0) { dev_err(card->dev, "unable to register OSS mixer device %i:%i\n", card->number, 0); kfree(mixer); return err; } mixer->oss_dev_alloc = 1; mixer->card = card; if (*card->mixername) strscpy(mixer->name, card->mixername, sizeof(mixer->name)); else snprintf(mixer->name, sizeof(mixer->name), "mixer%i", card->number); #ifdef SNDRV_OSS_INFO_DEV_MIXERS snd_oss_info_register(SNDRV_OSS_INFO_DEV_MIXERS, card->number, mixer->name); #endif for (idx = 0; idx < SNDRV_OSS_MAX_MIXERS; idx++) mixer->slots[idx].number = idx; card->mixer_oss = mixer; snd_mixer_oss_build(mixer); snd_mixer_oss_proc_init(mixer); } else { mixer = card->mixer_oss; if (mixer == NULL) return 0; if (mixer->oss_dev_alloc) { #ifdef SNDRV_OSS_INFO_DEV_MIXERS snd_oss_info_unregister(SNDRV_OSS_INFO_DEV_MIXERS, mixer->card->number); #endif snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_MIXER, mixer->card, 0); mixer->oss_dev_alloc = 0; } if (cmd == SND_MIXER_OSS_NOTIFY_DISCONNECT) return 0; snd_mixer_oss_proc_done(mixer); return snd_mixer_oss_free1(mixer); } return 0; } static int __init alsa_mixer_oss_init(void) { struct snd_card *card; int idx; snd_mixer_oss_notify_callback = snd_mixer_oss_notify_handler; for (idx = 0; idx < SNDRV_CARDS; idx++) { card = snd_card_ref(idx); if (card) { snd_mixer_oss_notify_handler(card, SND_MIXER_OSS_NOTIFY_REGISTER); snd_card_unref(card); } } return 0; } static void __exit alsa_mixer_oss_exit(void) { struct snd_card *card; int idx; snd_mixer_oss_notify_callback = NULL; for (idx = 0; idx < SNDRV_CARDS; idx++) { card = snd_card_ref(idx); if (card) { snd_mixer_oss_notify_handler(card, SND_MIXER_OSS_NOTIFY_FREE); snd_card_unref(card); } } } module_init(alsa_mixer_oss_init) module_exit(alsa_mixer_oss_exit)
12 49 7 42 1 42 7 40 9 42 7 1 48 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 // SPDX-License-Identifier: GPL-2.0-only /* * ialloc.c * * PURPOSE * Inode allocation handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT * (C) 1998-2001 Ben Fennema * * HISTORY * * 02/24/99 blf Created. * */ #include "udfdecl.h" #include <linux/fs.h> #include <linux/sched.h> #include <linux/slab.h> #include "udf_i.h" #include "udf_sb.h" void udf_free_inode(struct inode *inode) { udf_free_blocks(inode->i_sb, NULL, &UDF_I(inode)->i_location, 0, 1); } struct inode *udf_new_inode(struct inode *dir, umode_t mode) { struct super_block *sb = dir->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); struct inode *inode; udf_pblk_t block; uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; struct udf_inode_info *iinfo; struct udf_inode_info *dinfo = UDF_I(dir); int err; inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); iinfo = UDF_I(inode); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) { iinfo->i_efe = 1; if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev) sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE; iinfo->i_data = kzalloc(inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry), GFP_KERNEL); } else { iinfo->i_efe = 0; iinfo->i_data = kzalloc(inode->i_sb->s_blocksize - sizeof(struct fileEntry), GFP_KERNEL); } if (!iinfo->i_data) { make_bad_inode(inode); iput(inode); return ERR_PTR(-ENOMEM); } err = -ENOSPC; block = udf_new_block(dir->i_sb, NULL, dinfo->i_location.partitionReferenceNum, start, &err); if (err) { make_bad_inode(inode); iput(inode); return ERR_PTR(err); } iinfo->i_unique = lvid_get_unique_id(sb); inode->i_generation = iinfo->i_unique; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET)) inode->i_uid = sbi->s_uid; if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET)) inode->i_gid = sbi->s_gid; iinfo->i_location.logicalBlockNum = block; iinfo->i_location.partitionReferenceNum = dinfo->i_location.partitionReferenceNum; inode->i_ino = udf_get_lb_pblock(sb, &iinfo->i_location, 0); inode->i_blocks = 0; iinfo->i_lenEAttr = 0; iinfo->i_lenAlloc = 0; iinfo->i_use = 0; iinfo->i_checkpoint = 1; iinfo->i_extraPerms = FE_PERM_U_CHATTR; udf_update_extra_perms(inode, mode); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; simple_inode_init_ts(inode); iinfo->i_crtime = inode_get_mtime(inode); if (unlikely(insert_inode_locked(inode) < 0)) { make_bad_inode(inode); iput(inode); return ERR_PTR(-EIO); } mark_inode_dirty(inode); return inode; }
141 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 // SPDX-License-Identifier: GPL-2.0 struct io_tctx_node { struct list_head ctx_node; struct task_struct *task; struct io_ring_ctx *ctx; }; int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); void io_uring_del_tctx_node(unsigned long index); int __io_uring_add_tctx_node(struct io_ring_ctx *ctx); int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx); void io_uring_clean_tctx(struct io_uring_task *tctx); void io_uring_unreg_ringfd(void); int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, unsigned nr_args); int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, unsigned nr_args); /* * Note that this task has used io_uring. We use it for cancelation purposes. */ static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; if (likely(tctx && tctx->last == ctx)) return 0; return __io_uring_add_tctx_node_from_submit(ctx); }
2684 2679 56 6 6 3 3 745 86 59 1050 4 70 70 34 33 36 1050 487 632 412 219 1083 1048 43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 // SPDX-License-Identifier: GPL-2.0-only /* tnum: tracked (or tristate) numbers * * A tnum tracks knowledge about the bits of a value. Each bit can be either * known (0 or 1), or unknown (x). Arithmetic operations on tnums will * propagate the unknown bits such that the tnum result represents all the * possible results for possible values of the operands. */ #include <linux/kernel.h> #include <linux/tnum.h> #define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m} /* A completely unknown value */ const struct tnum tnum_unknown = { .value = 0, .mask = -1 }; struct tnum tnum_const(u64 value) { return TNUM(value, 0); } struct tnum tnum_range(u64 min, u64 max) { u64 chi = min ^ max, delta; u8 bits = fls64(chi); /* special case, needed because 1ULL << 64 is undefined */ if (bits > 63) return tnum_unknown; /* e.g. if chi = 4, bits = 3, delta = (1<<3) - 1 = 7. * if chi = 0, bits = 0, delta = (1<<0) - 1 = 0, so we return * constant min (since min == max). */ delta = (1ULL << bits) - 1; return TNUM(min & ~delta, delta); } struct tnum tnum_lshift(struct tnum a, u8 shift) { return TNUM(a.value << shift, a.mask << shift); } struct tnum tnum_rshift(struct tnum a, u8 shift) { return TNUM(a.value >> shift, a.mask >> shift); } struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness) { /* if a.value is negative, arithmetic shifting by minimum shift * will have larger negative offset compared to more shifting. * If a.value is nonnegative, arithmetic shifting by minimum shift * will have larger positive offset compare to more shifting. */ if (insn_bitness == 32) return TNUM((u32)(((s32)a.value) >> min_shift), (u32)(((s32)a.mask) >> min_shift)); else return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift); } struct tnum tnum_add(struct tnum a, struct tnum b) { u64 sm, sv, sigma, chi, mu; sm = a.mask + b.mask; sv = a.value + b.value; sigma = sm + sv; chi = sigma ^ sv; mu = chi | a.mask | b.mask; return TNUM(sv & ~mu, mu); } struct tnum tnum_sub(struct tnum a, struct tnum b) { u64 dv, alpha, beta, chi, mu; dv = a.value - b.value; alpha = dv + a.mask; beta = dv - b.mask; chi = alpha ^ beta; mu = chi | a.mask | b.mask; return TNUM(dv & ~mu, mu); } struct tnum tnum_and(struct tnum a, struct tnum b) { u64 alpha, beta, v; alpha = a.value | a.mask; beta = b.value | b.mask; v = a.value & b.value; return TNUM(v, alpha & beta & ~v); } struct tnum tnum_or(struct tnum a, struct tnum b) { u64 v, mu; v = a.value | b.value; mu = a.mask | b.mask; return TNUM(v, mu & ~v); } struct tnum tnum_xor(struct tnum a, struct tnum b) { u64 v, mu; v = a.value ^ b.value; mu = a.mask | b.mask; return TNUM(v & ~mu, mu); } /* Generate partial products by multiplying each bit in the multiplier (tnum a) * with the multiplicand (tnum b), and add the partial products after * appropriately bit-shifting them. Instead of directly performing tnum addition * on the generated partial products, equivalenty, decompose each partial * product into two tnums, consisting of the value-sum (acc_v) and the * mask-sum (acc_m) and then perform tnum addition on them. The following paper * explains the algorithm in more detail: https://arxiv.org/abs/2105.05398. */ struct tnum tnum_mul(struct tnum a, struct tnum b) { u64 acc_v = a.value * b.value; struct tnum acc_m = TNUM(0, 0); while (a.value || a.mask) { /* LSB of tnum a is a certain 1 */ if (a.value & 1) acc_m = tnum_add(acc_m, TNUM(0, b.mask)); /* LSB of tnum a is uncertain */ else if (a.mask & 1) acc_m = tnum_add(acc_m, TNUM(0, b.value | b.mask)); /* Note: no case for LSB is certain 0 */ a = tnum_rshift(a, 1); b = tnum_lshift(b, 1); } return tnum_add(TNUM(acc_v, 0), acc_m); } /* Note that if a and b disagree - i.e. one has a 'known 1' where the other has * a 'known 0' - this will return a 'known 1' for that bit. */ struct tnum tnum_intersect(struct tnum a, struct tnum b) { u64 v, mu; v = a.value | b.value; mu = a.mask & b.mask; return TNUM(v & ~mu, mu); } struct tnum tnum_cast(struct tnum a, u8 size) { a.value &= (1ULL << (size * 8)) - 1; a.mask &= (1ULL << (size * 8)) - 1; return a; } bool tnum_is_aligned(struct tnum a, u64 size) { if (!size) return true; return !((a.value | a.mask) & (size - 1)); } bool tnum_in(struct tnum a, struct tnum b) { if (b.mask & ~a.mask) return false; b.value &= ~a.mask; return a.value == b.value; } int tnum_strn(char *str, size_t size, struct tnum a) { return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask); } EXPORT_SYMBOL_GPL(tnum_strn); int tnum_sbin(char *str, size_t size, struct tnum a) { size_t n; for (n = 64; n; n--) { if (n < size) { if (a.mask & 1) str[n - 1] = 'x'; else if (a.value & 1) str[n - 1] = '1'; else str[n - 1] = '0'; } a.mask >>= 1; a.value >>= 1; } str[min(size - 1, (size_t)64)] = 0; return 64; } struct tnum tnum_subreg(struct tnum a) { return tnum_cast(a, 4); } struct tnum tnum_clear_subreg(struct tnum a) { return tnum_lshift(tnum_rshift(a, 32), 32); } struct tnum tnum_const_subreg(struct tnum a, u32 value) { return tnum_or(tnum_clear_subreg(a), tnum_const(value)); }
67 20 67 67 20 67 62 63 62 54 54 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 // SPDX-License-Identifier: GPL-2.0 #include "reiserfs.h" #include <linux/mutex.h> /* * The previous reiserfs locking scheme was heavily based on * the tricky properties of the Bkl: * * - it was acquired recursively by a same task * - the performances relied on the release-while-schedule() property * * Now that we replace it by a mutex, we still want to keep the same * recursive property to avoid big changes in the code structure. * We use our own lock_owner here because the owner field on a mutex * is only available in SMP or mutex debugging, also we only need this field * for this mutex, no need for a system wide mutex facility. * * Also this lock is often released before a call that could block because * reiserfs performances were partially based on the release while schedule() * property of the Bkl. */ void reiserfs_write_lock(struct super_block *s) { struct reiserfs_sb_info *sb_i = REISERFS_SB(s); if (sb_i->lock_owner != current) { mutex_lock(&sb_i->lock); sb_i->lock_owner = current; } /* No need to protect it, only the current task touches it */ sb_i->lock_depth++; } void reiserfs_write_unlock(struct super_block *s) { struct reiserfs_sb_info *sb_i = REISERFS_SB(s); /* * Are we unlocking without even holding the lock? * Such a situation must raise a BUG() if we don't want * to corrupt the data. */ BUG_ON(sb_i->lock_owner != current); if (--sb_i->lock_depth == -1) { sb_i->lock_owner = NULL; mutex_unlock(&sb_i->lock); } } int __must_check reiserfs_write_unlock_nested(struct super_block *s) { struct reiserfs_sb_info *sb_i = REISERFS_SB(s); int depth; /* this can happen when the lock isn't always held */ if (sb_i->lock_owner != current) return -1; depth = sb_i->lock_depth; sb_i->lock_depth = -1; sb_i->lock_owner = NULL; mutex_unlock(&sb_i->lock); return depth; } void reiserfs_write_lock_nested(struct super_block *s, int depth) { struct reiserfs_sb_info *sb_i = REISERFS_SB(s); /* this can happen when the lock isn't always held */ if (depth == -1) return; mutex_lock(&sb_i->lock); sb_i->lock_owner = current; sb_i->lock_depth = depth; } /* * Utility function to force a BUG if it is called without the superblock * write lock held. caller is the string printed just before calling BUG() */ void reiserfs_check_lock_depth(struct super_block *sb, char *caller) { struct reiserfs_sb_info *sb_i = REISERFS_SB(sb); WARN_ON(sb_i->lock_depth < 0); } #ifdef CONFIG_REISERFS_CHECK void reiserfs_lock_check_recursive(struct super_block *sb) { struct reiserfs_sb_info *sb_i = REISERFS_SB(sb); WARN_ONCE((sb_i->lock_depth > 0), "Unwanted recursive reiserfs lock!\n"); } #endif
176 194 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MMU_CONTEXT_H #define _ASM_X86_MMU_CONTEXT_H #include <asm/desc.h> #include <linux/atomic.h> #include <linux/mm_types.h> #include <linux/pkeys.h> #include <trace/events/tlb.h> #include <asm/tlbflush.h> #include <asm/paravirt.h> #include <asm/debugreg.h> #include <asm/gsseg.h> extern atomic64_t last_mm_ctx_id; #ifdef CONFIG_PERF_EVENTS DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key); DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key); void cr4_update_pce(void *ignored); #endif #ifdef CONFIG_MODIFY_LDT_SYSCALL /* * ldt_structs can be allocated, used, and freed, but they are never * modified while live. */ struct ldt_struct { /* * Xen requires page-aligned LDTs with special permissions. This is * needed to prevent us from installing evil descriptors such as * call gates. On native, we could merge the ldt_struct and LDT * allocations, but it's not worth trying to optimize. */ struct desc_struct *entries; unsigned int nr_entries; /* * If PTI is in use, then the entries array is not mapped while we're * in user mode. The whole array will be aliased at the addressed * given by ldt_slot_va(slot). We use two slots so that we can allocate * and map, and enable a new LDT without invalidating the mapping * of an older, still-in-use LDT. * * slot will be -1 if this LDT doesn't have an alias mapping. */ int slot; }; /* * Used for LDT copy/destruction. */ static inline void init_new_context_ldt(struct mm_struct *mm) { mm->context.ldt = NULL; init_rwsem(&mm->context.ldt_usr_sem); } int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm); void destroy_context_ldt(struct mm_struct *mm); void ldt_arch_exit_mmap(struct mm_struct *mm); #else /* CONFIG_MODIFY_LDT_SYSCALL */ static inline void init_new_context_ldt(struct mm_struct *mm) { } static inline int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm) { return 0; } static inline void destroy_context_ldt(struct mm_struct *mm) { } static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { } #endif #ifdef CONFIG_MODIFY_LDT_SYSCALL extern void load_mm_ldt(struct mm_struct *mm); extern void switch_ldt(struct mm_struct *prev, struct mm_struct *next); #else static inline void load_mm_ldt(struct mm_struct *mm) { clear_LDT(); } static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next) { DEBUG_LOCKS_WARN_ON(preemptible()); } #endif #ifdef CONFIG_ADDRESS_MASKING static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm) { return mm->context.lam_cr3_mask; } static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm) { mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask; mm->context.untag_mask = oldmm->context.untag_mask; } #define mm_untag_mask mm_untag_mask static inline unsigned long mm_untag_mask(struct mm_struct *mm) { return mm->context.untag_mask; } static inline void mm_reset_untag_mask(struct mm_struct *mm) { mm->context.untag_mask = -1UL; } #define arch_pgtable_dma_compat arch_pgtable_dma_compat static inline bool arch_pgtable_dma_compat(struct mm_struct *mm) { return !mm_lam_cr3_mask(mm) || test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags); } #else static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm) { return 0; } static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm) { } static inline void mm_reset_untag_mask(struct mm_struct *mm) { } #endif #define enter_lazy_tlb enter_lazy_tlb extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); /* * Init a new mm. Used on mm copies, like at fork() * and on mm's that are brand-new, like at execve(). */ #define init_new_context init_new_context static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { mutex_init(&mm->context.lock); mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); atomic64_set(&mm->context.tlb_gen, 0); #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { /* pkey 0 is the default and allocated implicitly */ mm->context.pkey_allocation_map = 0x1; /* -1 means unallocated or invalid */ mm->context.execute_only_pkey = -1; } #endif mm_reset_untag_mask(mm); init_new_context_ldt(mm); return 0; } #define destroy_context destroy_context static inline void destroy_context(struct mm_struct *mm) { destroy_context_ldt(mm); } extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk); extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk); #define switch_mm_irqs_off switch_mm_irqs_off #define activate_mm(prev, next) \ do { \ paravirt_enter_mmap(next); \ switch_mm((prev), (next), NULL); \ } while (0); #ifdef CONFIG_X86_32 #define deactivate_mm(tsk, mm) \ do { \ loadsegment(gs, 0); \ } while (0) #else #define deactivate_mm(tsk, mm) \ do { \ shstk_free(tsk); \ load_gs_index(0); \ loadsegment(fs, 0); \ } while (0) #endif static inline void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm) { #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return; /* Duplicate the oldmm pkey state in mm: */ mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map; mm->context.execute_only_pkey = oldmm->context.execute_only_pkey; #endif } static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { arch_dup_pkeys(oldmm, mm); paravirt_enter_mmap(mm); dup_lam(oldmm, mm); return ldt_dup_context(oldmm, mm); } static inline void arch_exit_mmap(struct mm_struct *mm) { paravirt_arch_exit_mmap(mm); ldt_arch_exit_mmap(mm); } #ifdef CONFIG_X86_64 static inline bool is_64bit_mm(struct mm_struct *mm) { return !IS_ENABLED(CONFIG_IA32_EMULATION) || !test_bit(MM_CONTEXT_UPROBE_IA32, &mm->context.flags); } #else static inline bool is_64bit_mm(struct mm_struct *mm) { return false; } #endif static inline void arch_unmap(struct mm_struct *mm, unsigned long start, unsigned long end) { } /* * We only want to enforce protection keys on the current process * because we effectively have no access to PKRU for other * processes or any way to tell *which * PKRU in a threaded * process we could use. * * So do not enforce things if the VMA is not from the current * mm, or if we are in a kernel thread. */ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write, bool execute, bool foreign) { /* pkeys never affect instruction fetches */ if (execute) return true; /* allow access if the VMA is not one from this process */ if (foreign || vma_is_foreign(vma)) return true; return __pkru_allows_pkey(vma_pkey(vma), write); } unsigned long __get_current_cr3_fast(void); #include <asm-generic/mmu_context.h> #endif /* _ASM_X86_MMU_CONTEXT_H */
442 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Dynamic loading of modules into the kernel. * * Rewritten by Richard Henderson <rth@tamu.edu> Dec 1996 * Rewritten again by Rusty Russell, 2002 */ #ifndef _LINUX_MODULE_H #define _LINUX_MODULE_H #include <linux/list.h> #include <linux/stat.h> #include <linux/buildid.h> #include <linux/compiler.h> #include <linux/cache.h> #include <linux/kmod.h> #include <linux/init.h> #include <linux/elf.h> #include <linux/stringify.h> #include <linux/kobject.h> #include <linux/moduleparam.h> #include <linux/jump_label.h> #include <linux/export.h> #include <linux/rbtree_latch.h> #include <linux/error-injection.h> #include <linux/tracepoint-defs.h> #include <linux/srcu.h> #include <linux/static_call_types.h> #include <linux/dynamic_debug.h> #include <linux/percpu.h> #include <asm/module.h> #define MODULE_NAME_LEN MAX_PARAM_PREFIX_LEN struct modversion_info { unsigned long crc; char name[MODULE_NAME_LEN]; }; struct module; struct exception_table_entry; struct module_kobject { struct kobject kobj; struct module *mod; struct kobject *drivers_dir; struct module_param_attrs *mp; struct completion *kobj_completion; } __randomize_layout; struct module_attribute { struct attribute attr; ssize_t (*show)(struct module_attribute *, struct module_kobject *, char *); ssize_t (*store)(struct module_attribute *, struct module_kobject *, const char *, size_t count); void (*setup)(struct module *, const char *); int (*test)(struct module *); void (*free)(struct module *); }; struct module_version_attribute { struct module_attribute mattr; const char *module_name; const char *version; }; extern ssize_t __modver_version_show(struct module_attribute *, struct module_kobject *, char *); extern struct module_attribute module_uevent; /* These are either module local, or the kernel's dummy ones. */ extern int init_module(void); extern void cleanup_module(void); #ifndef MODULE /** * module_init() - driver initialization entry point * @x: function to be run at kernel boot time or module insertion * * module_init() will either be called during do_initcalls() (if * builtin) or at module insertion time (if a module). There can only * be one per module. */ #define module_init(x) __initcall(x); /** * module_exit() - driver exit entry point * @x: function to be run when driver is removed * * module_exit() will wrap the driver clean-up code * with cleanup_module() when used with rmmod when * the driver is a module. If the driver is statically * compiled into the kernel, module_exit() has no effect. * There can only be one per module. */ #define module_exit(x) __exitcall(x); #else /* MODULE */ /* * In most cases loadable modules do not need custom * initcall levels. There are still some valid cases where * a driver may be needed early if built in, and does not * matter when built as a loadable module. Like bus * snooping debug drivers. */ #define early_initcall(fn) module_init(fn) #define core_initcall(fn) module_init(fn) #define core_initcall_sync(fn) module_init(fn) #define postcore_initcall(fn) module_init(fn) #define postcore_initcall_sync(fn) module_init(fn) #define arch_initcall(fn) module_init(fn) #define subsys_initcall(fn) module_init(fn) #define subsys_initcall_sync(fn) module_init(fn) #define fs_initcall(fn) module_init(fn) #define fs_initcall_sync(fn) module_init(fn) #define rootfs_initcall(fn) module_init(fn) #define device_initcall(fn) module_init(fn) #define device_initcall_sync(fn) module_init(fn) #define late_initcall(fn) module_init(fn) #define late_initcall_sync(fn) module_init(fn) #define console_initcall(fn) module_init(fn) /* Each module must use one module_init(). */ #define module_init(initfn) \ static inline initcall_t __maybe_unused __inittest(void) \ { return initfn; } \ int init_module(void) __copy(initfn) \ __attribute__((alias(#initfn))); \ ___ADDRESSABLE(init_module, __initdata); /* This is only required if you want to be unloadable. */ #define module_exit(exitfn) \ static inline exitcall_t __maybe_unused __exittest(void) \ { return exitfn; } \ void cleanup_module(void) __copy(exitfn) \ __attribute__((alias(#exitfn))); \ ___ADDRESSABLE(cleanup_module, __exitdata); #endif /* This means "can be init if no module support, otherwise module load may call it." */ #ifdef CONFIG_MODULES #define __init_or_module #define __initdata_or_module #define __initconst_or_module #define __INIT_OR_MODULE .text #define __INITDATA_OR_MODULE .data #define __INITRODATA_OR_MODULE .section ".rodata","a",%progbits #else #define __init_or_module __init #define __initdata_or_module __initdata #define __initconst_or_module __initconst #define __INIT_OR_MODULE __INIT #define __INITDATA_OR_MODULE __INITDATA #define __INITRODATA_OR_MODULE __INITRODATA #endif /*CONFIG_MODULES*/ /* Generic info of form tag = "info" */ #define MODULE_INFO(tag, info) __MODULE_INFO(tag, tag, info) /* For userspace: you can also call me... */ #define MODULE_ALIAS(_alias) MODULE_INFO(alias, _alias) /* Soft module dependencies. See man modprobe.d for details. * Example: MODULE_SOFTDEP("pre: module-foo module-bar post: module-baz") */ #define MODULE_SOFTDEP(_softdep) MODULE_INFO(softdep, _softdep) /* * MODULE_FILE is used for generating modules.builtin * So, make it no-op when this is being built as a module */ #ifdef MODULE #define MODULE_FILE #else #define MODULE_FILE MODULE_INFO(file, KBUILD_MODFILE); #endif /* * The following license idents are currently accepted as indicating free * software modules * * "GPL" [GNU Public License v2] * "GPL v2" [GNU Public License v2] * "GPL and additional rights" [GNU Public License v2 rights and more] * "Dual BSD/GPL" [GNU Public License v2 * or BSD license choice] * "Dual MIT/GPL" [GNU Public License v2 * or MIT license choice] * "Dual MPL/GPL" [GNU Public License v2 * or Mozilla license choice] * * The following other idents are available * * "Proprietary" [Non free products] * * Both "GPL v2" and "GPL" (the latter also in dual licensed strings) are * merely stating that the module is licensed under the GPL v2, but are not * telling whether "GPL v2 only" or "GPL v2 or later". The reason why there * are two variants is a historic and failed attempt to convey more * information in the MODULE_LICENSE string. For module loading the * "only/or later" distinction is completely irrelevant and does neither * replace the proper license identifiers in the corresponding source file * nor amends them in any way. The sole purpose is to make the * 'Proprietary' flagging work and to refuse to bind symbols which are * exported with EXPORT_SYMBOL_GPL when a non free module is loaded. * * In the same way "BSD" is not a clear license information. It merely * states, that the module is licensed under one of the compatible BSD * license variants. The detailed and correct license information is again * to be found in the corresponding source files. * * There are dual licensed components, but when running with Linux it is the * GPL that is relevant so this is a non issue. Similarly LGPL linked with GPL * is a GPL combined work. * * This exists for several reasons * 1. So modinfo can show license info for users wanting to vet their setup * is free * 2. So the community can ignore bug reports including proprietary modules * 3. So vendors can do likewise based on their own policies */ #define MODULE_LICENSE(_license) MODULE_FILE MODULE_INFO(license, _license) /* * Author(s), use "Name <email>" or just "Name", for multiple * authors use multiple MODULE_AUTHOR() statements/lines. */ #define MODULE_AUTHOR(_author) MODULE_INFO(author, _author) /* What your module does. */ #define MODULE_DESCRIPTION(_description) MODULE_INFO(description, _description) #ifdef MODULE /* Creates an alias so file2alias.c can find device table. */ #define MODULE_DEVICE_TABLE(type, name) \ extern typeof(name) __mod_##type##__##name##_device_table \ __attribute__ ((unused, alias(__stringify(name)))) #else /* !MODULE */ #define MODULE_DEVICE_TABLE(type, name) #endif /* Version of form [<epoch>:]<version>[-<extra-version>]. * Or for CVS/RCS ID version, everything but the number is stripped. * <epoch>: A (small) unsigned integer which allows you to start versions * anew. If not mentioned, it's zero. eg. "2:1.0" is after * "1:2.0". * <version>: The <version> may contain only alphanumerics and the * character `.'. Ordered by numeric sort for numeric parts, * ascii sort for ascii parts (as per RPM or DEB algorithm). * <extraversion>: Like <version>, but inserted for local * customizations, eg "rh3" or "rusty1". * Using this automatically adds a checksum of the .c files and the * local headers in "srcversion". */ #if defined(MODULE) || !defined(CONFIG_SYSFS) #define MODULE_VERSION(_version) MODULE_INFO(version, _version) #else #define MODULE_VERSION(_version) \ MODULE_INFO(version, _version); \ static struct module_version_attribute __modver_attr \ __used __section("__modver") \ __aligned(__alignof__(struct module_version_attribute)) \ = { \ .mattr = { \ .attr = { \ .name = "version", \ .mode = S_IRUGO, \ }, \ .show = __modver_version_show, \ }, \ .module_name = KBUILD_MODNAME, \ .version = _version, \ } #endif /* Optional firmware file (or files) needed by the module * format is simply firmware file name. Multiple firmware * files require multiple MODULE_FIRMWARE() specifiers */ #define MODULE_FIRMWARE(_firmware) MODULE_INFO(firmware, _firmware) #define MODULE_IMPORT_NS(ns) MODULE_INFO(import_ns, __stringify(ns)) struct notifier_block; #ifdef CONFIG_MODULES extern int modules_disabled; /* for sysctl */ /* Get/put a kernel symbol (calls must be symmetric) */ void *__symbol_get(const char *symbol); void *__symbol_get_gpl(const char *symbol); #define symbol_get(x) ((typeof(&x))(__symbol_get(__stringify(x)))) /* modules using other modules: kdb wants to see this. */ struct module_use { struct list_head source_list; struct list_head target_list; struct module *source, *target; }; enum module_state { MODULE_STATE_LIVE, /* Normal state. */ MODULE_STATE_COMING, /* Full formed, running module_init. */ MODULE_STATE_GOING, /* Going away. */ MODULE_STATE_UNFORMED, /* Still setting it up. */ }; struct mod_tree_node { struct module *mod; struct latch_tree_node node; }; enum mod_mem_type { MOD_TEXT = 0, MOD_DATA, MOD_RODATA, MOD_RO_AFTER_INIT, MOD_INIT_TEXT, MOD_INIT_DATA, MOD_INIT_RODATA, MOD_MEM_NUM_TYPES, MOD_INVALID = -1, }; #define mod_mem_type_is_init(type) \ ((type) == MOD_INIT_TEXT || \ (type) == MOD_INIT_DATA || \ (type) == MOD_INIT_RODATA) #define mod_mem_type_is_core(type) (!mod_mem_type_is_init(type)) #define mod_mem_type_is_text(type) \ ((type) == MOD_TEXT || \ (type) == MOD_INIT_TEXT) #define mod_mem_type_is_data(type) (!mod_mem_type_is_text(type)) #define mod_mem_type_is_core_data(type) \ (mod_mem_type_is_core(type) && \ mod_mem_type_is_data(type)) #define for_each_mod_mem_type(type) \ for (enum mod_mem_type (type) = 0; \ (type) < MOD_MEM_NUM_TYPES; (type)++) #define for_class_mod_mem_type(type, class) \ for_each_mod_mem_type(type) \ if (mod_mem_type_is_##class(type)) struct module_memory { void *base; unsigned int size; #ifdef CONFIG_MODULES_TREE_LOOKUP struct mod_tree_node mtn; #endif }; #ifdef CONFIG_MODULES_TREE_LOOKUP /* Only touch one cacheline for common rbtree-for-core-layout case. */ #define __module_memory_align ____cacheline_aligned #else #define __module_memory_align #endif struct mod_kallsyms { Elf_Sym *symtab; unsigned int num_symtab; char *strtab; char *typetab; }; #ifdef CONFIG_LIVEPATCH /** * struct klp_modinfo - ELF information preserved from the livepatch module * * @hdr: ELF header * @sechdrs: Section header table * @secstrings: String table for the section headers * @symndx: The symbol table section index */ struct klp_modinfo { Elf_Ehdr hdr; Elf_Shdr *sechdrs; char *secstrings; unsigned int symndx; }; #endif struct module { enum module_state state; /* Member of list of modules */ struct list_head list; /* Unique handle for this module */ char name[MODULE_NAME_LEN]; #ifdef CONFIG_STACKTRACE_BUILD_ID /* Module build ID */ unsigned char build_id[BUILD_ID_SIZE_MAX]; #endif /* Sysfs stuff. */ struct module_kobject mkobj; struct module_attribute *modinfo_attrs; const char *version; const char *srcversion; struct kobject *holders_dir; /* Exported symbols */ const struct kernel_symbol *syms; const s32 *crcs; unsigned int num_syms; #ifdef CONFIG_ARCH_USES_CFI_TRAPS s32 *kcfi_traps; s32 *kcfi_traps_end; #endif /* Kernel parameters. */ #ifdef CONFIG_SYSFS struct mutex param_lock; #endif struct kernel_param *kp; unsigned int num_kp; /* GPL-only exported symbols. */ unsigned int num_gpl_syms; const struct kernel_symbol *gpl_syms; const s32 *gpl_crcs; bool using_gplonly_symbols; #ifdef CONFIG_MODULE_SIG /* Signature was verified. */ bool sig_ok; #endif bool async_probe_requested; /* Exception table */ unsigned int num_exentries; struct exception_table_entry *extable; /* Startup function. */ int (*init)(void); struct module_memory mem[MOD_MEM_NUM_TYPES] __module_memory_align; /* Arch-specific module values */ struct mod_arch_specific arch; unsigned long taints; /* same bits as kernel:taint_flags */ #ifdef CONFIG_GENERIC_BUG /* Support for BUG */ unsigned num_bugs; struct list_head bug_list; struct bug_entry *bug_table; #endif #ifdef CONFIG_KALLSYMS /* Protected by RCU and/or module_mutex: use rcu_dereference() */ struct mod_kallsyms __rcu *kallsyms; struct mod_kallsyms core_kallsyms; /* Section attributes */ struct module_sect_attrs *sect_attrs; /* Notes attributes */ struct module_notes_attrs *notes_attrs; #endif /* The command line arguments (may be mangled). People like keeping pointers to this stuff */ char *args; #ifdef CONFIG_SMP /* Per-cpu data. */ void __percpu *percpu; unsigned int percpu_size; #endif void *noinstr_text_start; unsigned int noinstr_text_size; #ifdef CONFIG_TRACEPOINTS unsigned int num_tracepoints; tracepoint_ptr_t *tracepoints_ptrs; #endif #ifdef CONFIG_TREE_SRCU unsigned int num_srcu_structs; struct srcu_struct **srcu_struct_ptrs; #endif #ifdef CONFIG_BPF_EVENTS unsigned int num_bpf_raw_events; struct bpf_raw_event_map *bpf_raw_events; #endif #ifdef CONFIG_DEBUG_INFO_BTF_MODULES unsigned int btf_data_size; void *btf_data; #endif #ifdef CONFIG_JUMP_LABEL struct jump_entry *jump_entries; unsigned int num_jump_entries; #endif #ifdef CONFIG_TRACING unsigned int num_trace_bprintk_fmt; const char **trace_bprintk_fmt_start; #endif #ifdef CONFIG_EVENT_TRACING struct trace_event_call **trace_events; unsigned int num_trace_events; struct trace_eval_map **trace_evals; unsigned int num_trace_evals; #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD unsigned int num_ftrace_callsites; unsigned long *ftrace_callsites; #endif #ifdef CONFIG_KPROBES void *kprobes_text_start; unsigned int kprobes_text_size; unsigned long *kprobe_blacklist; unsigned int num_kprobe_blacklist; #endif #ifdef CONFIG_HAVE_STATIC_CALL_INLINE int num_static_call_sites; struct static_call_site *static_call_sites; #endif #if IS_ENABLED(CONFIG_KUNIT) int num_kunit_suites; struct kunit_suite **kunit_suites; #endif #ifdef CONFIG_LIVEPATCH bool klp; /* Is this a livepatch module? */ bool klp_alive; /* ELF information */ struct klp_modinfo *klp_info; #endif #ifdef CONFIG_PRINTK_INDEX unsigned int printk_index_size; struct pi_entry **printk_index_start; #endif #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ struct list_head source_list; /* What modules do I depend on? */ struct list_head target_list; /* Destruction function. */ void (*exit)(void); atomic_t refcnt; #endif #ifdef CONFIG_CONSTRUCTORS /* Constructor functions. */ ctor_fn_t *ctors; unsigned int num_ctors; #endif #ifdef CONFIG_FUNCTION_ERROR_INJECTION struct error_injection_entry *ei_funcs; unsigned int num_ei_funcs; #endif #ifdef CONFIG_DYNAMIC_DEBUG_CORE struct _ddebug_info dyndbg_info; #endif } ____cacheline_aligned __randomize_layout; #ifndef MODULE_ARCH_INIT #define MODULE_ARCH_INIT {} #endif #ifndef HAVE_ARCH_KALLSYMS_SYMBOL_VALUE static inline unsigned long kallsyms_symbol_value(const Elf_Sym *sym) { return sym->st_value; } #endif /* FIXME: It'd be nice to isolate modules during init, too, so they aren't used before they (may) fail. But presently too much code (IDE & SCSI) require entry into the module during init.*/ static inline bool module_is_live(struct module *mod) { return mod->state != MODULE_STATE_GOING; } struct module *__module_text_address(unsigned long addr); struct module *__module_address(unsigned long addr); bool is_module_address(unsigned long addr); bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr); bool is_module_percpu_address(unsigned long addr); bool is_module_text_address(unsigned long addr); static inline bool within_module_mem_type(unsigned long addr, const struct module *mod, enum mod_mem_type type) { unsigned long base, size; base = (unsigned long)mod->mem[type].base; size = mod->mem[type].size; return addr - base < size; } static inline bool within_module_core(unsigned long addr, const struct module *mod) { for_class_mod_mem_type(type, core) { if (within_module_mem_type(addr, mod, type)) return true; } return false; } static inline bool within_module_init(unsigned long addr, const struct module *mod) { for_class_mod_mem_type(type, init) { if (within_module_mem_type(addr, mod, type)) return true; } return false; } static inline bool within_module(unsigned long addr, const struct module *mod) { return within_module_init(addr, mod) || within_module_core(addr, mod); } /* Search for module by name: must be in a RCU-sched critical section. */ struct module *find_module(const char *name); extern void __noreturn __module_put_and_kthread_exit(struct module *mod, long code); #define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code) #ifdef CONFIG_MODULE_UNLOAD int module_refcount(struct module *mod); void __symbol_put(const char *symbol); #define symbol_put(x) __symbol_put(__stringify(x)) void symbol_put_addr(void *addr); /* Sometimes we know we already have a refcount, and it's easier not to handle the error case (which only happens with rmmod --wait). */ extern void __module_get(struct module *module); /** * try_module_get() - take module refcount unless module is being removed * @module: the module we should check for * * Only try to get a module reference count if the module is not being removed. * This call will fail if the module is already being removed. * * Care must also be taken to ensure the module exists and is alive prior to * usage of this call. This can be gauranteed through two means: * * 1) Direct protection: you know an earlier caller must have increased the * module reference through __module_get(). This can typically be achieved * by having another entity other than the module itself increment the * module reference count. * * 2) Implied protection: there is an implied protection against module * removal. An example of this is the implied protection used by kernfs / * sysfs. The sysfs store / read file operations are guaranteed to exist * through the use of kernfs's active reference (see kernfs_active()) and a * sysfs / kernfs file removal cannot happen unless the same file is not * active. Therefore, if a sysfs file is being read or written to the module * which created it must still exist. It is therefore safe to use * try_module_get() on module sysfs store / read ops. * * One of the real values to try_module_get() is the module_is_live() check * which ensures that the caller of try_module_get() can yield to userspace * module removal requests and gracefully fail if the module is on its way out. * * Returns true if the reference count was successfully incremented. */ extern bool try_module_get(struct module *module); /** * module_put() - release a reference count to a module * @module: the module we should release a reference count for * * If you successfully bump a reference count to a module with try_module_get(), * when you are finished you must call module_put() to release that reference * count. */ extern void module_put(struct module *module); #else /*!CONFIG_MODULE_UNLOAD*/ static inline bool try_module_get(struct module *module) { return !module || module_is_live(module); } static inline void module_put(struct module *module) { } static inline void __module_get(struct module *module) { } #define symbol_put(x) do { } while (0) #define symbol_put_addr(p) do { } while (0) #endif /* CONFIG_MODULE_UNLOAD */ /* This is a #define so the string doesn't get put in every .o file */ #define module_name(mod) \ ({ \ struct module *__mod = (mod); \ __mod ? __mod->name : "kernel"; \ }) /* Dereference module function descriptor */ void *dereference_module_function_descriptor(struct module *mod, void *ptr); int register_module_notifier(struct notifier_block *nb); int unregister_module_notifier(struct notifier_block *nb); extern void print_modules(void); static inline bool module_requested_async_probing(struct module *module) { return module && module->async_probe_requested; } static inline bool is_livepatch_module(struct module *mod) { #ifdef CONFIG_LIVEPATCH return mod->klp; #else return false; #endif } void set_module_sig_enforced(void); #else /* !CONFIG_MODULES... */ static inline struct module *__module_address(unsigned long addr) { return NULL; } static inline struct module *__module_text_address(unsigned long addr) { return NULL; } static inline bool is_module_address(unsigned long addr) { return false; } static inline bool is_module_percpu_address(unsigned long addr) { return false; } static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) { return false; } static inline bool is_module_text_address(unsigned long addr) { return false; } static inline bool within_module_core(unsigned long addr, const struct module *mod) { return false; } static inline bool within_module_init(unsigned long addr, const struct module *mod) { return false; } static inline bool within_module(unsigned long addr, const struct module *mod) { return false; } /* Get/put a kernel symbol (calls should be symmetric) */ #define symbol_get(x) ({ extern typeof(x) x __attribute__((weak,visibility("hidden"))); &(x); }) #define symbol_put(x) do { } while (0) #define symbol_put_addr(x) do { } while (0) static inline void __module_get(struct module *module) { } static inline bool try_module_get(struct module *module) { return true; } static inline void module_put(struct module *module) { } #define module_name(mod) "kernel" static inline int register_module_notifier(struct notifier_block *nb) { /* no events will happen anyway, so this can always succeed */ return 0; } static inline int unregister_module_notifier(struct notifier_block *nb) { return 0; } #define module_put_and_kthread_exit(code) kthread_exit(code) static inline void print_modules(void) { } static inline bool module_requested_async_probing(struct module *module) { return false; } static inline void set_module_sig_enforced(void) { } /* Dereference module function descriptor */ static inline void *dereference_module_function_descriptor(struct module *mod, void *ptr) { return ptr; } #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS extern struct kset *module_kset; extern const struct kobj_type module_ktype; #endif /* CONFIG_SYSFS */ #define symbol_request(x) try_then_request_module(symbol_get(x), "symbol:" #x) /* BELOW HERE ALL THESE ARE OBSOLETE AND WILL VANISH */ #define __MODULE_STRING(x) __stringify(x) #ifdef CONFIG_GENERIC_BUG void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, struct module *); void module_bug_cleanup(struct module *); #else /* !CONFIG_GENERIC_BUG */ static inline void module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod) { } static inline void module_bug_cleanup(struct module *mod) {} #endif /* CONFIG_GENERIC_BUG */ #ifdef CONFIG_RETPOLINE extern bool retpoline_module_ok(bool has_retpoline); #else static inline bool retpoline_module_ok(bool has_retpoline) { return true; } #endif #ifdef CONFIG_MODULE_SIG bool is_module_sig_enforced(void); static inline bool module_sig_ok(struct module *module) { return module->sig_ok; } #else /* !CONFIG_MODULE_SIG */ static inline bool is_module_sig_enforced(void) { return false; } static inline bool module_sig_ok(struct module *module) { return true; } #endif /* CONFIG_MODULE_SIG */ #if defined(CONFIG_MODULES) && defined(CONFIG_KALLSYMS) int module_kallsyms_on_each_symbol(const char *modname, int (*fn)(void *, const char *, unsigned long), void *data); /* For kallsyms to ask for address resolution. namebuf should be at * least KSYM_NAME_LEN long: a pointer to namebuf is returned if * found, otherwise NULL. */ const char *module_address_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, const unsigned char **modbuildid, char *namebuf); int lookup_module_symbol_name(unsigned long addr, char *symname); int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name); /* Returns 0 and fills in value, defined and namebuf, or -ERANGE if * symnum out of range. */ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *name, char *module_name, int *exported); /* Look for this name: can be of form module:name. */ unsigned long module_kallsyms_lookup_name(const char *name); unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name); #else /* CONFIG_MODULES && CONFIG_KALLSYMS */ static inline int module_kallsyms_on_each_symbol(const char *modname, int (*fn)(void *, const char *, unsigned long), void *data) { return -EOPNOTSUPP; } /* For kallsyms to ask for address resolution. NULL means not found. */ static inline const char *module_address_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, const unsigned char **modbuildid, char *namebuf) { return NULL; } static inline int lookup_module_symbol_name(unsigned long addr, char *symname) { return -ERANGE; } static inline int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *name, char *module_name, int *exported) { return -ERANGE; } static inline unsigned long module_kallsyms_lookup_name(const char *name) { return 0; } static inline unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) { return 0; } #endif /* CONFIG_MODULES && CONFIG_KALLSYMS */ #endif /* _LINUX_MODULE_H */
4 1 1 1 2 1 1 1 1 1 1 1 1 1 3 1 1 3 1 1 1 3 1 1 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 /* $Id: kcapi.c,v 1.1.2.8 2004/03/26 19:57:20 armin Exp $ * * Kernel CAPI 2.0 Module * * Copyright 1999 by Carsten Paeth <calle@calle.de> * Copyright 2002 by Kai Germaschewski <kai@germaschewski.name> * * This software may be used and distributed according to the terms * of the GNU General Public License, incorporated herein by reference. * */ #include "kcapi.h" #include <linux/module.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/ioport.h> #include <linux/proc_fs.h> #include <linux/sched/signal.h> #include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/workqueue.h> #include <linux/capi.h> #include <linux/kernelcapi.h> #include <linux/init.h> #include <linux/moduleparam.h> #include <linux/delay.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/isdn/capicmd.h> #include <linux/isdn/capiutil.h> #include <linux/mutex.h> #include <linux/rcupdate.h> static int showcapimsgs; static struct workqueue_struct *kcapi_wq; module_param(showcapimsgs, uint, 0); /* ------------------------------------------------------------- */ struct capictr_event { struct work_struct work; unsigned int type; u32 controller; }; /* ------------------------------------------------------------- */ static const struct capi_version driver_version = {2, 0, 1, 1 << 4}; static char driver_serial[CAPI_SERIAL_LEN] = "0004711"; static char capi_manufakturer[64] = "AVM Berlin"; #define NCCI2CTRL(ncci) (((ncci) >> 24) & 0x7f) struct capi_ctr *capi_controller[CAPI_MAXCONTR]; DEFINE_MUTEX(capi_controller_lock); struct capi20_appl *capi_applications[CAPI_MAXAPPL]; static int ncontrollers; /* -------- controller ref counting -------------------------------------- */ static inline struct capi_ctr * capi_ctr_get(struct capi_ctr *ctr) { if (!try_module_get(ctr->owner)) return NULL; return ctr; } static inline void capi_ctr_put(struct capi_ctr *ctr) { module_put(ctr->owner); } /* ------------------------------------------------------------- */ static inline struct capi_ctr *get_capi_ctr_by_nr(u16 contr) { if (contr < 1 || contr - 1 >= CAPI_MAXCONTR) return NULL; return capi_controller[contr - 1]; } static inline struct capi20_appl *__get_capi_appl_by_nr(u16 applid) { lockdep_assert_held(&capi_controller_lock); if (applid < 1 || applid - 1 >= CAPI_MAXAPPL) return NULL; return capi_applications[applid - 1]; } static inline struct capi20_appl *get_capi_appl_by_nr(u16 applid) { if (applid < 1 || applid - 1 >= CAPI_MAXAPPL) return NULL; return rcu_dereference(capi_applications[applid - 1]); } /* -------- util functions ------------------------------------ */ static inline int capi_cmd_valid(u8 cmd) { switch (cmd) { case CAPI_ALERT: case CAPI_CONNECT: case CAPI_CONNECT_ACTIVE: case CAPI_CONNECT_B3_ACTIVE: case CAPI_CONNECT_B3: case CAPI_CONNECT_B3_T90_ACTIVE: case CAPI_DATA_B3: case CAPI_DISCONNECT_B3: case CAPI_DISCONNECT: case CAPI_FACILITY: case CAPI_INFO: case CAPI_LISTEN: case CAPI_MANUFACTURER: case CAPI_RESET_B3: case CAPI_SELECT_B_PROTOCOL: return 1; } return 0; } static inline int capi_subcmd_valid(u8 subcmd) { switch (subcmd) { case CAPI_REQ: case CAPI_CONF: case CAPI_IND: case CAPI_RESP: return 1; } return 0; } /* ------------------------------------------------------------ */ static void register_appl(struct capi_ctr *ctr, u16 applid, capi_register_params *rparam) { ctr = capi_ctr_get(ctr); if (ctr) ctr->register_appl(ctr, applid, rparam); else printk(KERN_WARNING "%s: cannot get controller resources\n", __func__); } static void release_appl(struct capi_ctr *ctr, u16 applid) { DBG("applid %#x", applid); ctr->release_appl(ctr, applid); capi_ctr_put(ctr); } static void notify_up(u32 contr) { struct capi20_appl *ap; struct capi_ctr *ctr; u16 applid; mutex_lock(&capi_controller_lock); if (showcapimsgs & 1) printk(KERN_DEBUG "kcapi: notify up contr %d\n", contr); ctr = get_capi_ctr_by_nr(contr); if (ctr) { if (ctr->state == CAPI_CTR_RUNNING) goto unlock_out; ctr->state = CAPI_CTR_RUNNING; for (applid = 1; applid <= CAPI_MAXAPPL; applid++) { ap = __get_capi_appl_by_nr(applid); if (ap) register_appl(ctr, applid, &ap->rparam); } } else printk(KERN_WARNING "%s: invalid contr %d\n", __func__, contr); unlock_out: mutex_unlock(&capi_controller_lock); } static void ctr_down(struct capi_ctr *ctr, int new_state) { struct capi20_appl *ap; u16 applid; if (ctr->state == CAPI_CTR_DETECTED || ctr->state == CAPI_CTR_DETACHED) return; ctr->state = new_state; memset(ctr->manu, 0, sizeof(ctr->manu)); memset(&ctr->version, 0, sizeof(ctr->version)); memset(&ctr->profile, 0, sizeof(ctr->profile)); memset(ctr->serial, 0, sizeof(ctr->serial)); for (applid = 1; applid <= CAPI_MAXAPPL; applid++) { ap = __get_capi_appl_by_nr(applid); if (ap) capi_ctr_put(ctr); } } static void notify_down(u32 contr) { struct capi_ctr *ctr; mutex_lock(&capi_controller_lock); if (showcapimsgs & 1) printk(KERN_DEBUG "kcapi: notify down contr %d\n", contr); ctr = get_capi_ctr_by_nr(contr); if (ctr) ctr_down(ctr, CAPI_CTR_DETECTED); else printk(KERN_WARNING "%s: invalid contr %d\n", __func__, contr); mutex_unlock(&capi_controller_lock); } static void do_notify_work(struct work_struct *work) { struct capictr_event *event = container_of(work, struct capictr_event, work); switch (event->type) { case CAPICTR_UP: notify_up(event->controller); break; case CAPICTR_DOWN: notify_down(event->controller); break; } kfree(event); } static int notify_push(unsigned int event_type, u32 controller) { struct capictr_event *event = kmalloc(sizeof(*event), GFP_ATOMIC); if (!event) return -ENOMEM; INIT_WORK(&event->work, do_notify_work); event->type = event_type; event->controller = controller; queue_work(kcapi_wq, &event->work); return 0; } /* -------- Receiver ------------------------------------------ */ static void recv_handler(struct work_struct *work) { struct sk_buff *skb; struct capi20_appl *ap = container_of(work, struct capi20_appl, recv_work); if ((!ap) || (ap->release_in_progress)) return; mutex_lock(&ap->recv_mtx); while ((skb = skb_dequeue(&ap->recv_queue))) { if (CAPIMSG_CMD(skb->data) == CAPI_DATA_B3_IND) ap->nrecvdatapkt++; else ap->nrecvctlpkt++; ap->recv_message(ap, skb); } mutex_unlock(&ap->recv_mtx); } /** * capi_ctr_handle_message() - handle incoming CAPI message * @ctr: controller descriptor structure. * @appl: application ID. * @skb: message. * * Called by hardware driver to pass a CAPI message to the application. */ void capi_ctr_handle_message(struct capi_ctr *ctr, u16 appl, struct sk_buff *skb) { struct capi20_appl *ap; int showctl = 0; u8 cmd, subcmd; _cdebbuf *cdb; if (ctr->state != CAPI_CTR_RUNNING) { cdb = capi_message2str(skb->data); if (cdb) { printk(KERN_INFO "kcapi: controller [%03d] not active, got: %s", ctr->cnr, cdb->buf); cdebbuf_free(cdb); } else printk(KERN_INFO "kcapi: controller [%03d] not active, cannot trace\n", ctr->cnr); goto error; } cmd = CAPIMSG_COMMAND(skb->data); subcmd = CAPIMSG_SUBCOMMAND(skb->data); if (cmd == CAPI_DATA_B3 && subcmd == CAPI_IND) { ctr->nrecvdatapkt++; if (ctr->traceflag > 2) showctl |= 2; } else { ctr->nrecvctlpkt++; if (ctr->traceflag) showctl |= 2; } showctl |= (ctr->traceflag & 1); if (showctl & 2) { if (showctl & 1) { printk(KERN_DEBUG "kcapi: got [%03d] id#%d %s len=%u\n", ctr->cnr, CAPIMSG_APPID(skb->data), capi_cmd2str(cmd, subcmd), CAPIMSG_LEN(skb->data)); } else { cdb = capi_message2str(skb->data); if (cdb) { printk(KERN_DEBUG "kcapi: got [%03d] %s\n", ctr->cnr, cdb->buf); cdebbuf_free(cdb); } else printk(KERN_DEBUG "kcapi: got [%03d] id#%d %s len=%u, cannot trace\n", ctr->cnr, CAPIMSG_APPID(skb->data), capi_cmd2str(cmd, subcmd), CAPIMSG_LEN(skb->data)); } } rcu_read_lock(); ap = get_capi_appl_by_nr(CAPIMSG_APPID(skb->data)); if (!ap) { rcu_read_unlock(); cdb = capi_message2str(skb->data); if (cdb) { printk(KERN_ERR "kcapi: handle_message: applid %d state released (%s)\n", CAPIMSG_APPID(skb->data), cdb->buf); cdebbuf_free(cdb); } else printk(KERN_ERR "kcapi: handle_message: applid %d state released (%s) cannot trace\n", CAPIMSG_APPID(skb->data), capi_cmd2str(cmd, subcmd)); goto error; } skb_queue_tail(&ap->recv_queue, skb); queue_work(kcapi_wq, &ap->recv_work); rcu_read_unlock(); return; error: kfree_skb(skb); } EXPORT_SYMBOL(capi_ctr_handle_message); /** * capi_ctr_ready() - signal CAPI controller ready * @ctr: controller descriptor structure. * * Called by hardware driver to signal that the controller is up and running. */ void capi_ctr_ready(struct capi_ctr *ctr) { printk(KERN_NOTICE "kcapi: controller [%03d] \"%s\" ready.\n", ctr->cnr, ctr->name); notify_push(CAPICTR_UP, ctr->cnr); } EXPORT_SYMBOL(capi_ctr_ready); /** * capi_ctr_down() - signal CAPI controller not ready * @ctr: controller descriptor structure. * * Called by hardware driver to signal that the controller is down and * unavailable for use. */ void capi_ctr_down(struct capi_ctr *ctr) { printk(KERN_NOTICE "kcapi: controller [%03d] down.\n", ctr->cnr); notify_push(CAPICTR_DOWN, ctr->cnr); } EXPORT_SYMBOL(capi_ctr_down); /* ------------------------------------------------------------- */ /** * attach_capi_ctr() - register CAPI controller * @ctr: controller descriptor structure. * * Called by hardware driver to register a controller with the CAPI subsystem. * Return value: 0 on success, error code < 0 on error */ int attach_capi_ctr(struct capi_ctr *ctr) { int i; mutex_lock(&capi_controller_lock); for (i = 0; i < CAPI_MAXCONTR; i++) { if (!capi_controller[i]) break; } if (i == CAPI_MAXCONTR) { mutex_unlock(&capi_controller_lock); printk(KERN_ERR "kcapi: out of controller slots\n"); return -EBUSY; } capi_controller[i] = ctr; ctr->nrecvctlpkt = 0; ctr->nrecvdatapkt = 0; ctr->nsentctlpkt = 0; ctr->nsentdatapkt = 0; ctr->cnr = i + 1; ctr->state = CAPI_CTR_DETECTED; ctr->blocked = 0; ctr->traceflag = showcapimsgs; sprintf(ctr->procfn, "capi/controllers/%d", ctr->cnr); ctr->procent = proc_create_single_data(ctr->procfn, 0, NULL, ctr->proc_show, ctr); ncontrollers++; mutex_unlock(&capi_controller_lock); printk(KERN_NOTICE "kcapi: controller [%03d]: %s attached\n", ctr->cnr, ctr->name); return 0; } EXPORT_SYMBOL(attach_capi_ctr); /** * detach_capi_ctr() - unregister CAPI controller * @ctr: controller descriptor structure. * * Called by hardware driver to remove the registration of a controller * with the CAPI subsystem. * Return value: 0 on success, error code < 0 on error */ int detach_capi_ctr(struct capi_ctr *ctr) { int err = 0; mutex_lock(&capi_controller_lock); ctr_down(ctr, CAPI_CTR_DETACHED); if (ctr->cnr < 1 || ctr->cnr - 1 >= CAPI_MAXCONTR) { err = -EINVAL; goto unlock_out; } if (capi_controller[ctr->cnr - 1] != ctr) { err = -EINVAL; goto unlock_out; } capi_controller[ctr->cnr - 1] = NULL; ncontrollers--; if (ctr->procent) remove_proc_entry(ctr->procfn, NULL); printk(KERN_NOTICE "kcapi: controller [%03d]: %s unregistered\n", ctr->cnr, ctr->name); unlock_out: mutex_unlock(&capi_controller_lock); return err; } EXPORT_SYMBOL(detach_capi_ctr); /* ------------------------------------------------------------- */ /* -------- CAPI2.0 Interface ---------------------------------- */ /* ------------------------------------------------------------- */ /** * capi20_isinstalled() - CAPI 2.0 operation CAPI_INSTALLED * * Return value: CAPI result code (CAPI_NOERROR if at least one ISDN controller * is ready for use, CAPI_REGNOTINSTALLED otherwise) */ u16 capi20_isinstalled(void) { u16 ret = CAPI_REGNOTINSTALLED; int i; mutex_lock(&capi_controller_lock); for (i = 0; i < CAPI_MAXCONTR; i++) if (capi_controller[i] && capi_controller[i]->state == CAPI_CTR_RUNNING) { ret = CAPI_NOERROR; break; } mutex_unlock(&capi_controller_lock); return ret; } /** * capi20_register() - CAPI 2.0 operation CAPI_REGISTER * @ap: CAPI application descriptor structure. * * Register an application's presence with CAPI. * A unique application ID is assigned and stored in @ap->applid. * After this function returns successfully, the message receive * callback function @ap->recv_message() may be called at any time * until capi20_release() has been called for the same @ap. * Return value: CAPI result code */ u16 capi20_register(struct capi20_appl *ap) { int i; u16 applid; DBG(""); if (ap->rparam.datablklen < 128) return CAPI_LOGBLKSIZETOSMALL; ap->nrecvctlpkt = 0; ap->nrecvdatapkt = 0; ap->nsentctlpkt = 0; ap->nsentdatapkt = 0; mutex_init(&ap->recv_mtx); skb_queue_head_init(&ap->recv_queue); INIT_WORK(&ap->recv_work, recv_handler); ap->release_in_progress = 0; mutex_lock(&capi_controller_lock); for (applid = 1; applid <= CAPI_MAXAPPL; applid++) { if (capi_applications[applid - 1] == NULL) break; } if (applid > CAPI_MAXAPPL) { mutex_unlock(&capi_controller_lock); return CAPI_TOOMANYAPPLS; } ap->applid = applid; capi_applications[applid - 1] = ap; for (i = 0; i < CAPI_MAXCONTR; i++) { if (!capi_controller[i] || capi_controller[i]->state != CAPI_CTR_RUNNING) continue; register_appl(capi_controller[i], applid, &ap->rparam); } mutex_unlock(&capi_controller_lock); if (showcapimsgs & 1) { printk(KERN_DEBUG "kcapi: appl %d up\n", applid); } return CAPI_NOERROR; } /** * capi20_release() - CAPI 2.0 operation CAPI_RELEASE * @ap: CAPI application descriptor structure. * * Terminate an application's registration with CAPI. * After this function returns successfully, the message receive * callback function @ap->recv_message() will no longer be called. * Return value: CAPI result code */ u16 capi20_release(struct capi20_appl *ap) { int i; DBG("applid %#x", ap->applid); mutex_lock(&capi_controller_lock); ap->release_in_progress = 1; capi_applications[ap->applid - 1] = NULL; synchronize_rcu(); for (i = 0; i < CAPI_MAXCONTR; i++) { if (!capi_controller[i] || capi_controller[i]->state != CAPI_CTR_RUNNING) continue; release_appl(capi_controller[i], ap->applid); } mutex_unlock(&capi_controller_lock); flush_workqueue(kcapi_wq); skb_queue_purge(&ap->recv_queue); if (showcapimsgs & 1) { printk(KERN_DEBUG "kcapi: appl %d down\n", ap->applid); } return CAPI_NOERROR; } /** * capi20_put_message() - CAPI 2.0 operation CAPI_PUT_MESSAGE * @ap: CAPI application descriptor structure. * @skb: CAPI message. * * Transfer a single message to CAPI. * Return value: CAPI result code */ u16 capi20_put_message(struct capi20_appl *ap, struct sk_buff *skb) { struct capi_ctr *ctr; int showctl = 0; u8 cmd, subcmd; DBG("applid %#x", ap->applid); if (ncontrollers == 0) return CAPI_REGNOTINSTALLED; if ((ap->applid == 0) || ap->release_in_progress) return CAPI_ILLAPPNR; if (skb->len < 12 || !capi_cmd_valid(CAPIMSG_COMMAND(skb->data)) || !capi_subcmd_valid(CAPIMSG_SUBCOMMAND(skb->data))) return CAPI_ILLCMDORSUBCMDORMSGTOSMALL; /* * The controller reference is protected by the existence of the * application passed to us. We assume that the caller properly * synchronizes this service with capi20_release. */ ctr = get_capi_ctr_by_nr(CAPIMSG_CONTROLLER(skb->data)); if (!ctr || ctr->state != CAPI_CTR_RUNNING) return CAPI_REGNOTINSTALLED; if (ctr->blocked) return CAPI_SENDQUEUEFULL; cmd = CAPIMSG_COMMAND(skb->data); subcmd = CAPIMSG_SUBCOMMAND(skb->data); if (cmd == CAPI_DATA_B3 && subcmd == CAPI_REQ) { ctr->nsentdatapkt++; ap->nsentdatapkt++; if (ctr->traceflag > 2) showctl |= 2; } else { ctr->nsentctlpkt++; ap->nsentctlpkt++; if (ctr->traceflag) showctl |= 2; } showctl |= (ctr->traceflag & 1); if (showctl & 2) { if (showctl & 1) { printk(KERN_DEBUG "kcapi: put [%03d] id#%d %s len=%u\n", CAPIMSG_CONTROLLER(skb->data), CAPIMSG_APPID(skb->data), capi_cmd2str(cmd, subcmd), CAPIMSG_LEN(skb->data)); } else { _cdebbuf *cdb = capi_message2str(skb->data); if (cdb) { printk(KERN_DEBUG "kcapi: put [%03d] %s\n", CAPIMSG_CONTROLLER(skb->data), cdb->buf); cdebbuf_free(cdb); } else printk(KERN_DEBUG "kcapi: put [%03d] id#%d %s len=%u cannot trace\n", CAPIMSG_CONTROLLER(skb->data), CAPIMSG_APPID(skb->data), capi_cmd2str(cmd, subcmd), CAPIMSG_LEN(skb->data)); } } return ctr->send_message(ctr, skb); } /** * capi20_get_manufacturer() - CAPI 2.0 operation CAPI_GET_MANUFACTURER * @contr: controller number. * @buf: result buffer (64 bytes). * * Retrieve information about the manufacturer of the specified ISDN controller * or (for @contr == 0) the driver itself. * Return value: CAPI result code */ u16 capi20_get_manufacturer(u32 contr, u8 buf[CAPI_MANUFACTURER_LEN]) { struct capi_ctr *ctr; u16 ret; if (contr == 0) { strscpy_pad(buf, capi_manufakturer, CAPI_MANUFACTURER_LEN); return CAPI_NOERROR; } mutex_lock(&capi_controller_lock); ctr = get_capi_ctr_by_nr(contr); if (ctr && ctr->state == CAPI_CTR_RUNNING) { strscpy_pad(buf, ctr->manu, CAPI_MANUFACTURER_LEN); ret = CAPI_NOERROR; } else ret = CAPI_REGNOTINSTALLED; mutex_unlock(&capi_controller_lock); return ret; } /** * capi20_get_version() - CAPI 2.0 operation CAPI_GET_VERSION * @contr: controller number. * @verp: result structure. * * Retrieve version information for the specified ISDN controller * or (for @contr == 0) the driver itself. * Return value: CAPI result code */ u16 capi20_get_version(u32 contr, struct capi_version *verp) { struct capi_ctr *ctr; u16 ret; if (contr == 0) { *verp = driver_version; return CAPI_NOERROR; } mutex_lock(&capi_controller_lock); ctr = get_capi_ctr_by_nr(contr); if (ctr && ctr->state == CAPI_CTR_RUNNING) { memcpy(verp, &ctr->version, sizeof(capi_version)); ret = CAPI_NOERROR; } else ret = CAPI_REGNOTINSTALLED; mutex_unlock(&capi_controller_lock); return ret; } /** * capi20_get_serial() - CAPI 2.0 operation CAPI_GET_SERIAL_NUMBER * @contr: controller number. * @serial: result buffer (8 bytes). * * Retrieve the serial number of the specified ISDN controller * or (for @contr == 0) the driver itself. * Return value: CAPI result code */ u16 capi20_get_serial(u32 contr, u8 serial[CAPI_SERIAL_LEN]) { struct capi_ctr *ctr; u16 ret; if (contr == 0) { strscpy(serial, driver_serial, CAPI_SERIAL_LEN); return CAPI_NOERROR; } mutex_lock(&capi_controller_lock); ctr = get_capi_ctr_by_nr(contr); if (ctr && ctr->state == CAPI_CTR_RUNNING) { strscpy(serial, ctr->serial, CAPI_SERIAL_LEN); ret = CAPI_NOERROR; } else ret = CAPI_REGNOTINSTALLED; mutex_unlock(&capi_controller_lock); return ret; } /** * capi20_get_profile() - CAPI 2.0 operation CAPI_GET_PROFILE * @contr: controller number. * @profp: result structure. * * Retrieve capability information for the specified ISDN controller * or (for @contr == 0) the number of installed controllers. * Return value: CAPI result code */ u16 capi20_get_profile(u32 contr, struct capi_profile *profp) { struct capi_ctr *ctr; u16 ret; if (contr == 0) { profp->ncontroller = ncontrollers; return CAPI_NOERROR; } mutex_lock(&capi_controller_lock); ctr = get_capi_ctr_by_nr(contr); if (ctr && ctr->state == CAPI_CTR_RUNNING) { memcpy(profp, &ctr->profile, sizeof(struct capi_profile)); ret = CAPI_NOERROR; } else ret = CAPI_REGNOTINSTALLED; mutex_unlock(&capi_controller_lock); return ret; } /** * capi20_manufacturer() - CAPI 2.0 operation CAPI_MANUFACTURER * @cmd: command. * @data: parameter. * * Perform manufacturer specific command. * Return value: CAPI result code */ int capi20_manufacturer(unsigned long cmd, void __user *data) { struct capi_ctr *ctr; int retval; switch (cmd) { case KCAPI_CMD_TRACE: { kcapi_flagdef fdef; if (copy_from_user(&fdef, data, sizeof(kcapi_flagdef))) return -EFAULT; mutex_lock(&capi_controller_lock); ctr = get_capi_ctr_by_nr(fdef.contr); if (ctr) { ctr->traceflag = fdef.flag; printk(KERN_INFO "kcapi: contr [%03d] set trace=%d\n", ctr->cnr, ctr->traceflag); retval = 0; } else retval = -ESRCH; mutex_unlock(&capi_controller_lock); return retval; } default: printk(KERN_ERR "kcapi: manufacturer command %lu unknown.\n", cmd); break; } return -EINVAL; } /* ------------------------------------------------------------- */ /* -------- Init & Cleanup ------------------------------------- */ /* ------------------------------------------------------------- */ /* * init / exit functions */ int __init kcapi_init(void) { int err; kcapi_wq = alloc_workqueue("kcapi", 0, 0); if (!kcapi_wq) return -ENOMEM; err = cdebug_init(); if (err) { destroy_workqueue(kcapi_wq); return err; } kcapi_proc_init(); return 0; } void kcapi_exit(void) { kcapi_proc_exit(); cdebug_exit(); destroy_workqueue(kcapi_wq); }
6 1 1 1 1 3 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 // SPDX-License-Identifier: GPL-2.0-only /* * iptables module to match inet_addr_type() of an ip. * * Copyright (c) 2004 Patrick McHardy <kaber@trash.net> * (C) 2007 Laszlo Attila Toth <panther@balabit.hu> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/ip.h> #include <net/route.h> #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/ip6_fib.h> #endif #include <linux/netfilter_ipv6.h> #include <linux/netfilter/xt_addrtype.h> #include <linux/netfilter/x_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Xtables: address type match"); MODULE_ALIAS("ipt_addrtype"); MODULE_ALIAS("ip6t_addrtype"); #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, const struct in6_addr *addr, u16 mask) { struct flowi6 flow; struct rt6_info *rt; u32 ret = 0; int route_err; memset(&flow, 0, sizeof(flow)); flow.daddr = *addr; if (dev) flow.flowi6_oif = dev->ifindex; if (dev && (mask & XT_ADDRTYPE_LOCAL)) { if (nf_ipv6_chk_addr(net, addr, dev, true)) ret = XT_ADDRTYPE_LOCAL; } route_err = nf_ip6_route(net, (struct dst_entry **)&rt, flowi6_to_flowi(&flow), false); if (route_err) return XT_ADDRTYPE_UNREACHABLE; if (rt->rt6i_flags & RTF_REJECT) ret = XT_ADDRTYPE_UNREACHABLE; if (dev == NULL && rt->rt6i_flags & RTF_LOCAL) ret |= XT_ADDRTYPE_LOCAL; if (ipv6_anycast_destination((struct dst_entry *)rt, addr)) ret |= XT_ADDRTYPE_ANYCAST; dst_release(&rt->dst); return ret; } static bool match_type6(struct net *net, const struct net_device *dev, const struct in6_addr *addr, u16 mask) { int addr_type = ipv6_addr_type(addr); if ((mask & XT_ADDRTYPE_MULTICAST) && !(addr_type & IPV6_ADDR_MULTICAST)) return false; if ((mask & XT_ADDRTYPE_UNICAST) && !(addr_type & IPV6_ADDR_UNICAST)) return false; if ((mask & XT_ADDRTYPE_UNSPEC) && addr_type != IPV6_ADDR_ANY) return false; if ((XT_ADDRTYPE_LOCAL | XT_ADDRTYPE_ANYCAST | XT_ADDRTYPE_UNREACHABLE) & mask) return !!(mask & match_lookup_rt6(net, dev, addr, mask)); return true; } static bool addrtype_mt6(struct net *net, const struct net_device *dev, const struct sk_buff *skb, const struct xt_addrtype_info_v1 *info) { const struct ipv6hdr *iph = ipv6_hdr(skb); bool ret = true; if (info->source) ret &= match_type6(net, dev, &iph->saddr, info->source) ^ (info->flags & XT_ADDRTYPE_INVERT_SOURCE); if (ret && info->dest) ret &= match_type6(net, dev, &iph->daddr, info->dest) ^ !!(info->flags & XT_ADDRTYPE_INVERT_DEST); return ret; } #endif static inline bool match_type(struct net *net, const struct net_device *dev, __be32 addr, u_int16_t mask) { return !!(mask & (1 << inet_dev_addr_type(net, dev, addr))); } static bool addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { struct net *net = xt_net(par); const struct xt_addrtype_info *info = par->matchinfo; const struct iphdr *iph = ip_hdr(skb); bool ret = true; if (info->source) ret &= match_type(net, NULL, iph->saddr, info->source) ^ info->invert_source; if (info->dest) ret &= match_type(net, NULL, iph->daddr, info->dest) ^ info->invert_dest; return ret; } static bool addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) { struct net *net = xt_net(par); const struct xt_addrtype_info_v1 *info = par->matchinfo; const struct iphdr *iph; const struct net_device *dev = NULL; bool ret = true; if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN) dev = xt_in(par); else if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) dev = xt_out(par); #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) if (xt_family(par) == NFPROTO_IPV6) return addrtype_mt6(net, dev, skb, info); #endif iph = ip_hdr(skb); if (info->source) ret &= match_type(net, dev, iph->saddr, info->source) ^ (info->flags & XT_ADDRTYPE_INVERT_SOURCE); if (ret && info->dest) ret &= match_type(net, dev, iph->daddr, info->dest) ^ !!(info->flags & XT_ADDRTYPE_INVERT_DEST); return ret; } static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) { const char *errmsg = "both incoming and outgoing interface limitation cannot be selected"; struct xt_addrtype_info_v1 *info = par->matchinfo; if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN && info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) goto err; if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN)) && info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) { errmsg = "output interface limitation not valid in PREROUTING and INPUT"; goto err; } if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_OUT)) && info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN) { errmsg = "input interface limitation not valid in POSTROUTING and OUTPUT"; goto err; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) if (par->family == NFPROTO_IPV6) { if ((info->source | info->dest) & XT_ADDRTYPE_BLACKHOLE) { errmsg = "ipv6 BLACKHOLE matching not supported"; goto err; } if ((info->source | info->dest) >= XT_ADDRTYPE_PROHIBIT) { errmsg = "ipv6 PROHIBIT (THROW, NAT ..) matching not supported"; goto err; } if ((info->source | info->dest) & XT_ADDRTYPE_BROADCAST) { errmsg = "ipv6 does not support BROADCAST matching"; goto err; } } #endif return 0; err: pr_info_ratelimited("%s\n", errmsg); return -EINVAL; } static struct xt_match addrtype_mt_reg[] __read_mostly = { { .name = "addrtype", .family = NFPROTO_IPV4, .match = addrtype_mt_v0, .matchsize = sizeof(struct xt_addrtype_info), .me = THIS_MODULE }, { .name = "addrtype", .family = NFPROTO_UNSPEC, .revision = 1, .match = addrtype_mt_v1, .checkentry = addrtype_mt_checkentry_v1, .matchsize = sizeof(struct xt_addrtype_info_v1), .me = THIS_MODULE } }; static int __init addrtype_mt_init(void) { return xt_register_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg)); } static void __exit addrtype_mt_exit(void) { xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg)); } module_init(addrtype_mt_init); module_exit(addrtype_mt_exit);
32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UACCESS_H #define _ASM_X86_UACCESS_H /* * User space memory access functions */ #include <linux/compiler.h> #include <linux/instrumented.h> #include <linux/kasan-checks.h> #include <linux/mm_types.h> #include <linux/string.h> #include <linux/mmap_lock.h> #include <asm/asm.h> #include <asm/page.h> #include <asm/smap.h> #include <asm/extable.h> #include <asm/tlbflush.h> #ifdef CONFIG_X86_32 # include <asm/uaccess_32.h> #else # include <asm/uaccess_64.h> #endif #include <asm-generic/access_ok.h> extern int __get_user_1(void); extern int __get_user_2(void); extern int __get_user_4(void); extern int __get_user_8(void); extern int __get_user_nocheck_1(void); extern int __get_user_nocheck_2(void); extern int __get_user_nocheck_4(void); extern int __get_user_nocheck_8(void); extern int __get_user_bad(void); #define __uaccess_begin() stac() #define __uaccess_end() clac() #define __uaccess_begin_nospec() \ ({ \ stac(); \ barrier_nospec(); \ }) /* * This is the smallest unsigned integer type that can fit a value * (up to 'long long') */ #define __inttype(x) __typeof__( \ __typefits(x,char, \ __typefits(x,short, \ __typefits(x,int, \ __typefits(x,long,0ULL))))) #define __typefits(x,type,not) \ __builtin_choose_expr(sizeof(x)<=sizeof(type),(unsigned type)0,not) /* * This is used for both get_user() and __get_user() to expand to * the proper special function call that has odd calling conventions * due to returning both a value and an error, and that depends on * the size of the pointer passed in. * * Careful: we have to cast the result to the type of the pointer * for sign reasons. * * The use of _ASM_DX as the register specifier is a bit of a * simplification, as gcc only cares about it as the starting point * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits * (%ecx being the next register in gcc's x86 register sequence), and * %rdx on 64 bits. * * Clang/LLVM cares about the size of the register, but still wants * the base register for something that ends up being a pair. */ #define do_get_user_call(fn,x,ptr) \ ({ \ int __ret_gu; \ register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ __chk_user_ptr(ptr); \ asm volatile("call __" #fn "_%P4" \ : "=a" (__ret_gu), "=r" (__val_gu), \ ASM_CALL_CONSTRAINT \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ instrument_get_user(__val_gu); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ __builtin_expect(__ret_gu, 0); \ }) /** * get_user - Get a simple variable from user space. * @x: Variable to store result. * @ptr: Source address, in user space. * * Context: User context only. This function may sleep if pagefaults are * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and the result of * dereferencing @ptr must be assignable to @x without a cast. * * Return: zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ #define get_user(x,ptr) ({ might_fault(); do_get_user_call(get_user,x,ptr); }) /** * __get_user - Get a simple variable from user space, with less checking. * @x: Variable to store result. * @ptr: Source address, in user space. * * Context: User context only. This function may sleep if pagefaults are * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and the result of * dereferencing @ptr must be assignable to @x without a cast. * * Caller must check the pointer with access_ok() before calling this * function. * * Return: zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ #define __get_user(x,ptr) do_get_user_call(get_user_nocheck,x,ptr) #ifdef CONFIG_X86_32 #define __put_user_goto_u64(x, addr, label) \ asm_volatile_goto("\n" \ "1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ _ASM_EXTABLE_UA(1b, %l2) \ _ASM_EXTABLE_UA(2b, %l2) \ : : "A" (x), "r" (addr) \ : : label) #else #define __put_user_goto_u64(x, ptr, label) \ __put_user_goto(x, ptr, "q", "er", label) #endif extern void __put_user_bad(void); /* * Strange magic calling convention: pointer in %ecx, * value in %eax(:%edx), return value in %ecx. clobbers %rbx */ extern void __put_user_1(void); extern void __put_user_2(void); extern void __put_user_4(void); extern void __put_user_8(void); extern void __put_user_nocheck_1(void); extern void __put_user_nocheck_2(void); extern void __put_user_nocheck_4(void); extern void __put_user_nocheck_8(void); /* * ptr must be evaluated and assigned to the temporary __ptr_pu before * the assignment of x to __val_pu, to avoid any function calls * involved in the ptr expression (possibly implicitly generated due * to KASAN) from clobbering %ax. */ #define do_put_user_call(fn,x,ptr) \ ({ \ int __ret_pu; \ void __user *__ptr_pu; \ register __typeof__(*(ptr)) __val_pu asm("%"_ASM_AX); \ __typeof__(*(ptr)) __x = (x); /* eval x once */ \ __typeof__(ptr) __ptr = (ptr); /* eval ptr once */ \ __chk_user_ptr(__ptr); \ __ptr_pu = __ptr; \ __val_pu = __x; \ asm volatile("call __" #fn "_%P[size]" \ : "=c" (__ret_pu), \ ASM_CALL_CONSTRAINT \ : "0" (__ptr_pu), \ "r" (__val_pu), \ [size] "i" (sizeof(*(ptr))) \ :"ebx"); \ instrument_put_user(__x, __ptr, sizeof(*(ptr))); \ __builtin_expect(__ret_pu, 0); \ }) /** * put_user - Write a simple value into user space. * @x: Value to copy to user space. * @ptr: Destination address, in user space. * * Context: User context only. This function may sleep if pagefaults are * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and @x must be assignable * to the result of dereferencing @ptr. * * Return: zero on success, or -EFAULT on error. */ #define put_user(x, ptr) ({ might_fault(); do_put_user_call(put_user,x,ptr); }) /** * __put_user - Write a simple value into user space, with less checking. * @x: Value to copy to user space. * @ptr: Destination address, in user space. * * Context: User context only. This function may sleep if pagefaults are * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and @x must be assignable * to the result of dereferencing @ptr. * * Caller must check the pointer with access_ok() before calling this * function. * * Return: zero on success, or -EFAULT on error. */ #define __put_user(x, ptr) do_put_user_call(put_user_nocheck,x,ptr) #define __put_user_size(x, ptr, size, label) \ do { \ __typeof__(*(ptr)) __x = (x); /* eval x once */ \ __typeof__(ptr) __ptr = (ptr); /* eval ptr once */ \ __chk_user_ptr(__ptr); \ switch (size) { \ case 1: \ __put_user_goto(__x, __ptr, "b", "iq", label); \ break; \ case 2: \ __put_user_goto(__x, __ptr, "w", "ir", label); \ break; \ case 4: \ __put_user_goto(__x, __ptr, "l", "ir", label); \ break; \ case 8: \ __put_user_goto_u64(__x, __ptr, label); \ break; \ default: \ __put_user_bad(); \ } \ instrument_put_user(__x, __ptr, size); \ } while (0) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT #ifdef CONFIG_X86_32 #define __get_user_asm_u64(x, ptr, label) do { \ unsigned int __gu_low, __gu_high; \ const unsigned int __user *__gu_ptr; \ __gu_ptr = (const void __user *)(ptr); \ __get_user_asm(__gu_low, __gu_ptr, "l", "=r", label); \ __get_user_asm(__gu_high, __gu_ptr+1, "l", "=r", label); \ (x) = ((unsigned long long)__gu_high << 32) | __gu_low; \ } while (0) #else #define __get_user_asm_u64(x, ptr, label) \ __get_user_asm(x, ptr, "q", "=r", label) #endif #define __get_user_size(x, ptr, size, label) \ do { \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: { \ unsigned char x_u8__; \ __get_user_asm(x_u8__, ptr, "b", "=q", label); \ (x) = x_u8__; \ break; \ } \ case 2: \ __get_user_asm(x, ptr, "w", "=r", label); \ break; \ case 4: \ __get_user_asm(x, ptr, "l", "=r", label); \ break; \ case 8: \ __get_user_asm_u64(x, ptr, label); \ break; \ default: \ (x) = __get_user_bad(); \ } \ instrument_get_user(x); \ } while (0) #define __get_user_asm(x, addr, itype, ltype, label) \ asm_volatile_goto("\n" \ "1: mov"itype" %[umem],%[output]\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : [output] ltype(x) \ : [umem] "m" (__m(addr)) \ : : label) #else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT #ifdef CONFIG_X86_32 #define __get_user_asm_u64(x, ptr, retval) \ ({ \ __typeof__(ptr) __ptr = (ptr); \ asm volatile("\n" \ "1: movl %[lowbits],%%eax\n" \ "2: movl %[highbits],%%edx\n" \ "3:\n" \ _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG | \ EX_FLAG_CLEAR_AX_DX, \ %[errout]) \ _ASM_EXTABLE_TYPE_REG(2b, 3b, EX_TYPE_EFAULT_REG | \ EX_FLAG_CLEAR_AX_DX, \ %[errout]) \ : [errout] "=r" (retval), \ [output] "=&A"(x) \ : [lowbits] "m" (__m(__ptr)), \ [highbits] "m" __m(((u32 __user *)(__ptr)) + 1), \ "0" (retval)); \ }) #else #define __get_user_asm_u64(x, ptr, retval) \ __get_user_asm(x, ptr, retval, "q") #endif #define __get_user_size(x, ptr, size, retval) \ do { \ unsigned char x_u8__; \ \ retval = 0; \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ __get_user_asm(x_u8__, ptr, retval, "b"); \ (x) = x_u8__; \ break; \ case 2: \ __get_user_asm(x, ptr, retval, "w"); \ break; \ case 4: \ __get_user_asm(x, ptr, retval, "l"); \ break; \ case 8: \ __get_user_asm_u64(x, ptr, retval); \ break; \ default: \ (x) = __get_user_bad(); \ } \ } while (0) #define __get_user_asm(x, addr, err, itype) \ asm volatile("\n" \ "1: mov"itype" %[umem],%[output]\n" \ "2:\n" \ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG | \ EX_FLAG_CLEAR_AX, \ %[errout]) \ : [errout] "=r" (err), \ [output] "=a" (x) \ : [umem] "m" (__m(addr)), \ "0" (err)) #endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT #ifdef CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT #define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label) ({ \ bool success; \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ asm_volatile_goto("\n" \ "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\ _ASM_EXTABLE_UA(1b, %l[label]) \ : CC_OUT(z) (success), \ [ptr] "+m" (*_ptr), \ [old] "+a" (__old) \ : [new] ltype (__new) \ : "memory" \ : label); \ if (unlikely(!success)) \ *_old = __old; \ likely(success); }) #ifdef CONFIG_X86_32 #define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label) ({ \ bool success; \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ asm_volatile_goto("\n" \ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \ _ASM_EXTABLE_UA(1b, %l[label]) \ : CC_OUT(z) (success), \ "+A" (__old), \ [ptr] "+m" (*_ptr) \ : "b" ((u32)__new), \ "c" ((u32)((u64)__new >> 32)) \ : "memory" \ : label); \ if (unlikely(!success)) \ *_old = __old; \ likely(success); }) #endif // CONFIG_X86_32 #else // !CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT #define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label) ({ \ int __err = 0; \ bool success; \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ asm volatile("\n" \ "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\ CC_SET(z) \ "2:\n" \ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, \ %[errout]) \ : CC_OUT(z) (success), \ [errout] "+r" (__err), \ [ptr] "+m" (*_ptr), \ [old] "+a" (__old) \ : [new] ltype (__new) \ : "memory"); \ if (unlikely(__err)) \ goto label; \ if (unlikely(!success)) \ *_old = __old; \ likely(success); }) #ifdef CONFIG_X86_32 /* * Unlike the normal CMPXCHG, use output GPR for both success/fail and error. * There are only six GPRs available and four (EAX, EBX, ECX, and EDX) are * hardcoded by CMPXCHG8B, leaving only ESI and EDI. If the compiler uses * both ESI and EDI for the memory operand, compilation will fail if the error * is an input+output as there will be no register available for input. */ #define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label) ({ \ int __result; \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ asm volatile("\n" \ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \ "mov $0, %[result]\n\t" \ "setz %b[result]\n" \ "2:\n" \ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, \ %[result]) \ : [result] "=q" (__result), \ "+A" (__old), \ [ptr] "+m" (*_ptr) \ : "b" ((u32)__new), \ "c" ((u32)((u64)__new >> 32)) \ : "memory", "cc"); \ if (unlikely(__result < 0)) \ goto label; \ if (unlikely(!__result)) \ *_old = __old; \ likely(__result); }) #endif // CONFIG_X86_32 #endif // CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT /* FIXME: this hack is definitely wrong -AK */ struct __large_struct { unsigned long buf[100]; }; #define __m(x) (*(struct __large_struct __user *)(x)) /* * Tell gcc we read from memory instead of writing: this is because * we do not write to any memory gcc knows about, so there are no * aliasing issues. */ #define __put_user_goto(x, addr, itype, ltype, label) \ asm_volatile_goto("\n" \ "1: mov"itype" %0,%1\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : : ltype(x), "m" (__m(addr)) \ : : label) extern unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n); extern __must_check long strncpy_from_user(char *dst, const char __user *src, long count); extern __must_check long strnlen_user(const char __user *str, long n); #ifdef CONFIG_ARCH_HAS_COPY_MC unsigned long __must_check copy_mc_to_kernel(void *to, const void *from, unsigned len); #define copy_mc_to_kernel copy_mc_to_kernel unsigned long __must_check copy_mc_to_user(void __user *to, const void *from, unsigned len); #endif /* * movsl can be slow when source and dest are not both 8-byte aligned */ #ifdef CONFIG_X86_INTEL_USERCOPY extern struct movsl_mask { int mask; } ____cacheline_aligned_in_smp movsl_mask; #endif #define ARCH_HAS_NOCACHE_UACCESS 1 /* * The "unsafe" user accesses aren't really "unsafe", but the naming * is a big fat warning: you have to not only do the access_ok() * checking before using them, but you have to surround them with the * user_access_begin/end() pair. */ static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len) { if (unlikely(!access_ok(ptr,len))) return 0; __uaccess_begin_nospec(); return 1; } #define user_access_begin(a,b) user_access_begin(a,b) #define user_access_end() __uaccess_end() #define user_access_save() smap_save() #define user_access_restore(x) smap_restore(x) #define unsafe_put_user(x, ptr, label) \ __put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define unsafe_get_user(x, ptr, err_label) \ do { \ __inttype(*(ptr)) __gu_val; \ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), err_label); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ } while (0) #else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define unsafe_get_user(x, ptr, err_label) \ do { \ int __gu_err; \ __inttype(*(ptr)) __gu_val; \ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ if (unlikely(__gu_err)) goto err_label; \ } while (0) #endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT extern void __try_cmpxchg_user_wrong_size(void); #ifndef CONFIG_X86_32 #define __try_cmpxchg64_user_asm(_ptr, _oldp, _nval, _label) \ __try_cmpxchg_user_asm("q", "r", (_ptr), (_oldp), (_nval), _label) #endif /* * Force the pointer to u<size> to match the size expected by the asm helper. * clang/LLVM compiles all cases and only discards the unused paths after * processing errors, which breaks i386 if the pointer is an 8-byte value. */ #define unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({ \ bool __ret; \ __chk_user_ptr(_ptr); \ switch (sizeof(*(_ptr))) { \ case 1: __ret = __try_cmpxchg_user_asm("b", "q", \ (__force u8 *)(_ptr), (_oldp), \ (_nval), _label); \ break; \ case 2: __ret = __try_cmpxchg_user_asm("w", "r", \ (__force u16 *)(_ptr), (_oldp), \ (_nval), _label); \ break; \ case 4: __ret = __try_cmpxchg_user_asm("l", "r", \ (__force u32 *)(_ptr), (_oldp), \ (_nval), _label); \ break; \ case 8: __ret = __try_cmpxchg64_user_asm((__force u64 *)(_ptr), (_oldp),\ (_nval), _label); \ break; \ default: __try_cmpxchg_user_wrong_size(); \ } \ __ret; }) /* "Returns" 0 on success, 1 on failure, -EFAULT if the access faults. */ #define __try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({ \ int __ret = -EFAULT; \ __uaccess_begin_nospec(); \ __ret = !unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label); \ _label: \ __uaccess_end(); \ __ret; \ }) /* * We want the unsafe accessors to always be inlined and use * the error labels - thus the macro games. */ #define unsafe_copy_loop(dst, src, len, type, label) \ while (len >= sizeof(type)) { \ unsafe_put_user(*(type *)(src),(type __user *)(dst),label); \ dst += sizeof(type); \ src += sizeof(type); \ len -= sizeof(type); \ } #define unsafe_copy_to_user(_dst,_src,_len,label) \ do { \ char __user *__ucu_dst = (_dst); \ const char *__ucu_src = (_src); \ size_t __ucu_len = (_len); \ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label); \ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label); \ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label); \ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \ } while (0) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define __get_kernel_nofault(dst, src, type, err_label) \ __get_user_size(*((type *)(dst)), (__force type __user *)(src), \ sizeof(type), err_label) #else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define __get_kernel_nofault(dst, src, type, err_label) \ do { \ int __kr_err; \ \ __get_user_size(*((type *)(dst)), (__force type __user *)(src), \ sizeof(type), __kr_err); \ if (unlikely(__kr_err)) \ goto err_label; \ } while (0) #endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define __put_kernel_nofault(dst, src, type, err_label) \ __put_user_size(*((type *)(src)), (__force type __user *)(dst), \ sizeof(type), err_label) #endif /* _ASM_X86_UACCESS_H */
383 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PID_NS_H #define _LINUX_PID_NS_H #include <linux/sched.h> #include <linux/bug.h> #include <linux/mm.h> #include <linux/workqueue.h> #include <linux/threads.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/idr.h> /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 struct fs_pin; #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) /* modes for vm.memfd_noexec sysctl */ #define MEMFD_NOEXEC_SCOPE_EXEC 0 /* MFD_EXEC implied if unset */ #define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1 /* MFD_NOEXEC_SEAL implied if unset */ #define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2 /* same as 1, except MFD_EXEC rejected */ #endif struct pid_namespace { struct idr idr; struct rcu_head rcu; unsigned int pid_allocated; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; struct pid_namespace *parent; #ifdef CONFIG_BSD_PROCESS_ACCT struct fs_pin *bacct; #endif struct user_namespace *user_ns; struct ucounts *ucounts; int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) int memfd_noexec_scope; #endif } __randomize_layout; extern struct pid_namespace init_pid_ns; #define PIDNS_ADDING (1U << 31) #ifdef CONFIG_PID_NS static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { if (ns != &init_pid_ns) refcount_inc(&ns->ns.count); return ns; } #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) { int scope = MEMFD_NOEXEC_SCOPE_EXEC; for (; ns; ns = ns->parent) scope = max(scope, READ_ONCE(ns->memfd_noexec_scope)); return scope; } #else static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) { return 0; } #endif extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct user_namespace *user_ns, struct pid_namespace *ns); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); extern void put_pid_ns(struct pid_namespace *ns); #else /* !CONFIG_PID_NS */ #include <linux/err.h> static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { return ns; } static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) { return 0; } static inline struct pid_namespace *copy_pid_ns(unsigned long flags, struct user_namespace *user_ns, struct pid_namespace *ns) { if (flags & CLONE_NEWPID) ns = ERR_PTR(-EINVAL); return ns; } static inline void put_pid_ns(struct pid_namespace *ns) { } static inline void zap_pid_ns_processes(struct pid_namespace *ns) { BUG(); } static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) { return 0; } #endif /* CONFIG_PID_NS */ extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); void pidhash_init(void); void pid_idr_init(void); static inline bool task_is_in_init_pid_ns(struct task_struct *tsk) { return task_active_pid_ns(tsk) == &init_pid_ns; } #endif /* _LINUX_PID_NS_H */
593 1258 40 39 417 1 415 662 662 683 684 677 398 398 4 9687 9567 196 196 87 8 8 6 68 68 67 68 70 68 13 13 4 6599 6596 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 // SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. * Copyright (C) 2009 Red Hat, Inc. * * Creation is done via kthreadd, so that we get a clean environment * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ #include <uapi/linux/sched/types.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/kthread.h> #include <linux/completion.h> #include <linux/err.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/unistd.h> #include <linux/file.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> #include <linux/numa.h> #include <linux/sched/isolation.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ char *full_name; int (*threadfn)(void *data); void *data; int node; /* Result passed back to kthread_create() from kthreadd. */ struct task_struct *result; struct completion *done; struct list_head list; }; struct kthread { unsigned long flags; unsigned int cpu; int result; int (*threadfn)(void *); void *data; struct completion parked; struct completion exited; #ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif /* To store the full name if task comm is truncated. */ char *full_name; }; enum KTHREAD_BITS { KTHREAD_IS_PER_CPU = 0, KTHREAD_SHOULD_STOP, KTHREAD_SHOULD_PARK, }; static inline struct kthread *to_kthread(struct task_struct *k) { WARN_ON(!(k->flags & PF_KTHREAD)); return k->worker_private; } /* * Variant of to_kthread() that doesn't assume @p is a kthread. * * Per construction; when: * * (p->flags & PF_KTHREAD) && p->worker_private * * the task is both a kthread and struct kthread is persistent. However * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and * begin_new_exec()). */ static inline struct kthread *__to_kthread(struct task_struct *p) { void *kthread = p->worker_private; if (kthread && !(p->flags & PF_KTHREAD)) kthread = NULL; return kthread; } void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) { struct kthread *kthread = to_kthread(tsk); if (!kthread || !kthread->full_name) { __get_task_comm(buf, buf_size, tsk); return; } strscpy_pad(buf, kthread->full_name, buf_size); } bool set_kthread_struct(struct task_struct *p) { struct kthread *kthread; if (WARN_ON_ONCE(to_kthread(p))) return false; kthread = kzalloc(sizeof(*kthread), GFP_KERNEL); if (!kthread) return false; init_completion(&kthread->exited); init_completion(&kthread->parked); p->vfork_done = &kthread->exited; p->worker_private = kthread; return true; } void free_kthread_struct(struct task_struct *k) { struct kthread *kthread; /* * Can be NULL if kmalloc() in set_kthread_struct() failed. */ kthread = to_kthread(k); if (!kthread) return; #ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread->blkcg_css); #endif k->worker_private = NULL; kfree(kthread->full_name); kfree(kthread); } /** * kthread_should_stop - should this kthread return now? * * When someone calls kthread_stop() on your kthread, it will be woken * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). */ bool kthread_should_stop(void) { return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); } EXPORT_SYMBOL(kthread_should_stop); static bool __kthread_should_park(struct task_struct *k) { return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags); } /** * kthread_should_park - should this kthread park now? * * When someone calls kthread_park() on your kthread, it will be woken * and this will return true. You should then do the necessary * cleanup and call kthread_parkme() * * Similar to kthread_should_stop(), but this keeps the thread alive * and in a park position. kthread_unpark() "restarts" the thread and * calls the thread function again. */ bool kthread_should_park(void) { return __kthread_should_park(current); } EXPORT_SYMBOL_GPL(kthread_should_park); bool kthread_should_stop_or_park(void) { struct kthread *kthread = __to_kthread(current); if (!kthread) return false; return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK)); } /** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen * * kthread_should_stop() for freezable kthreads, which will enter * refrigerator if necessary. This function is safe from kthread_stop() / * freezer deadlock and freezable kthreads should use this function instead * of calling try_to_freeze() directly. */ bool kthread_freezable_should_stop(bool *was_frozen) { bool frozen = false; might_sleep(); if (unlikely(freezing(current))) frozen = __refrigerator(true); if (was_frozen) *was_frozen = frozen; return kthread_should_stop(); } EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); /** * kthread_func - return the function specified on kthread creation * @task: kthread task in question * * Returns NULL if the task is not a kthread. */ void *kthread_func(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); if (kthread) return kthread->threadfn; return NULL; } EXPORT_SYMBOL_GPL(kthread_func); /** * kthread_data - return data value specified on kthread creation * @task: kthread task in question * * Return the data value specified when kthread @task was created. * The caller is responsible for ensuring the validity of @task when * calling this function. */ void *kthread_data(struct task_struct *task) { return to_kthread(task)->data; } EXPORT_SYMBOL_GPL(kthread_data); /** * kthread_probe_data - speculative version of kthread_data() * @task: possible kthread task in question * * @task could be a kthread task. Return the data value specified when it * was created if accessible. If @task isn't a kthread task or its data is * inaccessible for any reason, %NULL is returned. This function requires * that @task itself is safe to dereference. */ void *kthread_probe_data(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); void *data = NULL; if (kthread) copy_from_kernel_nofault(&data, &kthread->data, sizeof(data)); return data; } static void __kthread_parkme(struct kthread *self) { for (;;) { /* * TASK_PARKED is a special state; we must serialize against * possible pending wakeups to avoid store-store collisions on * task->state. * * Such a collision might possibly result in the task state * changin from TASK_PARKED and us failing the * wait_task_inactive() in kthread_park(). */ set_special_state(TASK_PARKED); if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; /* * Thread is going to call schedule(), do not preempt it, * or the caller of kthread_park() may spend more time in * wait_task_inactive(). */ preempt_disable(); complete(&self->parked); schedule_preempt_disabled(); preempt_enable(); } __set_current_state(TASK_RUNNING); } void kthread_parkme(void) { __kthread_parkme(to_kthread(current)); } EXPORT_SYMBOL_GPL(kthread_parkme); /** * kthread_exit - Cause the current kthread return @result to kthread_stop(). * @result: The integer value to return to kthread_stop(). * * While kthread_exit can be called directly, it exists so that * functions which do some additional work in non-modular code such as * module_put_and_kthread_exit can be implemented. * * Does not return. */ void __noreturn kthread_exit(long result) { struct kthread *kthread = to_kthread(current); kthread->result = result; do_exit(0); } /** * kthread_complete_and_exit - Exit the current kthread. * @comp: Completion to complete * @code: The integer value to return to kthread_stop(). * * If present, complete @comp and then return code to kthread_stop(). * * A kernel thread whose module may be removed after the completion of * @comp can use this function to exit safely. * * Does not return. */ void __noreturn kthread_complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); kthread_exit(code); } EXPORT_SYMBOL(kthread_complete_and_exit); static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; /* Copy data: it's on kthread's stack */ struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; void *data = create->data; struct completion *done; struct kthread *self; int ret; self = to_kthread(current); /* Release the structure when caller killed by a fatal signal. */ done = xchg(&create->done, NULL); if (!done) { kfree(create->full_name); kfree(create); kthread_exit(-EINTR); } self->full_name = create->full_name; self->threadfn = threadfn; self->data = data; /* * The new thread inherited kthreadd's priority and CPU mask. Reset * back to default in case they have been changed. */ sched_setscheduler_nocheck(current, SCHED_NORMAL, &param); set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD)); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; /* * Thread is going to call schedule(), do not preempt it, * or the creator may spend more time in wait_task_inactive(). */ preempt_disable(); complete(done); schedule_preempt_disabled(); preempt_enable(); ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { cgroup_kthread_ready(); __kthread_parkme(self); ret = threadfn(data); } kthread_exit(ret); } /* called from kernel_clone() to get node information for about to be created task */ int tsk_fork_get_node(struct task_struct *tsk) { #ifdef CONFIG_NUMA if (tsk == kthreadd_task) return tsk->pref_node_fork; #endif return NUMA_NO_NODE; } static void create_kthread(struct kthread_create_info *create) { int pid; #ifdef CONFIG_NUMA current->pref_node_fork = create->node; #endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, create->full_name, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { /* Release the structure when caller killed by a fatal signal. */ struct completion *done = xchg(&create->done, NULL); kfree(create->full_name); if (!done) { kfree(create); return; } create->result = ERR_PTR(pid); complete(done); } } static __printf(4, 0) struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], va_list args) { DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; struct kthread_create_info *create = kmalloc(sizeof(*create), GFP_KERNEL); if (!create) return ERR_PTR(-ENOMEM); create->threadfn = threadfn; create->data = data; create->node = node; create->done = &done; create->full_name = kvasprintf(GFP_KERNEL, namefmt, args); if (!create->full_name) { task = ERR_PTR(-ENOMEM); goto free_create; } spin_lock(&kthread_create_lock); list_add_tail(&create->list, &kthread_create_list); spin_unlock(&kthread_create_lock); wake_up_process(kthreadd_task); /* * Wait for completion in killable state, for I might be chosen by * the OOM killer while kthreadd is trying to allocate memory for * new kernel thread. */ if (unlikely(wait_for_completion_killable(&done))) { /* * If I was killed by a fatal signal before kthreadd (or new * kernel thread) calls complete(), leave the cleanup of this * structure to that thread. */ if (xchg(&create->done, NULL)) return ERR_PTR(-EINTR); /* * kthreadd (or new kernel thread) will call complete() * shortly. */ wait_for_completion(&done); } task = create->result; free_create: kfree(create); return task; } /** * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @node: task and thread structures for the thread are allocated on this node * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and * is affine to all CPUs. * * If thread is going to be bound on a particular cpu, give its node * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either return directly if it is a * standalone thread for which no one will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], ...) { struct task_struct *task; va_list args; va_start(args, namefmt); task = __kthread_create_on_node(threadfn, data, node, namefmt, args); va_end(args); return task; } EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { unsigned long flags; if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } /* It's safe because the task is inactive. */ raw_spin_lock_irqsave(&p->pi_lock, flags); do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; raw_spin_unlock_irqrestore(&p->pi_lock, flags); } static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) { __kthread_bind_mask(p, cpumask_of(cpu), state); } void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) { __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); } /** * kthread_bind - bind a just-created kthread to a cpu. * @p: thread created by kthread_create(). * @cpu: cpu (might not be online, must be possible) for @k to run on. * * Description: This function is equivalent to set_cpus_allowed(), * except that @cpu doesn't need to be online, and the thread must be * stopped (i.e., just returned from kthread_create()). */ void kthread_bind(struct task_struct *p, unsigned int cpu) { __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(kthread_bind); /** * kthread_create_on_cpu - Create a cpu bound kthread * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @cpu: The cpu on which the thread should be bound, * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Description: This helper function creates and names a kernel thread */ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), void *data, unsigned int cpu, const char *namefmt) { struct task_struct *p; p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, cpu); if (IS_ERR(p)) return p; kthread_bind(p, cpu); /* CPU hotplug need to bind once again when unparking the thread. */ to_kthread(p)->cpu = cpu; return p; } EXPORT_SYMBOL(kthread_create_on_cpu); void kthread_set_per_cpu(struct task_struct *k, int cpu) { struct kthread *kthread = to_kthread(k); if (!kthread) return; WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY)); if (cpu < 0) { clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags); return; } kthread->cpu = cpu; set_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } bool kthread_is_per_cpu(struct task_struct *p) { struct kthread *kthread = __to_kthread(p); if (!kthread) return false; return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } /** * kthread_unpark - unpark a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return false, wakes it, and * waits for it to return. If the thread is marked percpu then its * bound to the cpu again. */ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); /* * Newly created kthread was parked when the CPU was offline. * The binding was lost and we need to set it again. */ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. */ wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); /** * kthread_park - park a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return true, wakes it, and * waits for it to return. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will park without * calling threadfn(). * * Returns 0 if the thread is parked, -ENOSYS if the thread exited. * If called by the kthread itself just the park bit is set. */ int kthread_park(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) return -EBUSY; set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); /* * Wait for __kthread_parkme() to complete(), this means we * _will_ have TASK_PARKED and are about to call schedule(). */ wait_for_completion(&kthread->parked); /* * Now wait for that schedule() to complete and the task to * get scheduled out. */ WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED)); } return 0; } EXPORT_SYMBOL_GPL(kthread_park); /** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_stop() for @k to return true, wakes it, and * waits for it to exit. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will exit without * calling threadfn(). * * If threadfn() may call kthread_exit() itself, the caller must ensure * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() * was never called. */ int kthread_stop(struct task_struct *k) { struct kthread *kthread; int ret; trace_sched_kthread_stop(k); get_task_struct(k); kthread = to_kthread(k); set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); kthread_unpark(k); set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL); wake_up_process(k); wait_for_completion(&kthread->exited); ret = kthread->result; put_task_struct(k); trace_sched_kthread_stop_ret(ret); return ret; } EXPORT_SYMBOL(kthread_stop); /** * kthread_stop_put - stop a thread and put its task struct * @k: thread created by kthread_create(). * * Stops a thread created by kthread_create() and put its task_struct. * Only use when holding an extra task struct reference obtained by * calling get_task_struct(). */ int kthread_stop_put(struct task_struct *k) { int ret; ret = kthread_stop(k); put_task_struct(k); return ret; } EXPORT_SYMBOL(kthread_stop_put); int kthreadd(void *unused) { struct task_struct *tsk = current; /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; cgroup_init_kthreadd(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (list_empty(&kthread_create_list)) schedule(); __set_current_state(TASK_RUNNING); spin_lock(&kthread_create_lock); while (!list_empty(&kthread_create_list)) { struct kthread_create_info *create; create = list_entry(kthread_create_list.next, struct kthread_create_info, list); list_del_init(&create->list); spin_unlock(&kthread_create_lock); create_kthread(create); spin_lock(&kthread_create_lock); } spin_unlock(&kthread_create_lock); } return 0; } void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) { memset(worker, 0, sizeof(struct kthread_worker)); raw_spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); INIT_LIST_HEAD(&worker->delayed_work_list); } EXPORT_SYMBOL_GPL(__kthread_init_worker); /** * kthread_worker_fn - kthread function to process kthread_worker * @worker_ptr: pointer to initialized kthread_worker * * This function implements the main cycle of kthread worker. It processes * work_list until it is stopped with kthread_stop(). It sleeps when the queue * is empty. * * The works are not allowed to keep any locks, disable preemption or interrupts * when they finish. There is defined a safe point for freezing when one work * finishes and before a new one is started. * * Also the works must not be handled by more than one worker at the same time, * see also kthread_queue_work(). */ int kthread_worker_fn(void *worker_ptr) { struct kthread_worker *worker = worker_ptr; struct kthread_work *work; /* * FIXME: Update the check and remove the assignment when all kthread * worker users are created using kthread_create_worker*() functions. */ WARN_ON(worker->task && worker->task != current); worker->task = current; if (worker->flags & KTW_FREEZABLE) set_freezable(); repeat: set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&worker->lock); worker->task = NULL; raw_spin_unlock_irq(&worker->lock); return 0; } work = NULL; raw_spin_lock_irq(&worker->lock); if (!list_empty(&worker->work_list)) { work = list_first_entry(&worker->work_list, struct kthread_work, node); list_del_init(&work->node); } worker->current_work = work; raw_spin_unlock_irq(&worker->lock); if (work) { kthread_work_func_t func = work->func; __set_current_state(TASK_RUNNING); trace_sched_kthread_work_execute_start(work); work->func(work); /* * Avoid dereferencing work after this point. The trace * event only cares about the address. */ trace_sched_kthread_work_execute_end(work, func); } else if (!freezing(current)) schedule(); try_to_freeze(); cond_resched(); goto repeat; } EXPORT_SYMBOL_GPL(kthread_worker_fn); static __printf(3, 0) struct kthread_worker * __kthread_create_worker(int cpu, unsigned int flags, const char namefmt[], va_list args) { struct kthread_worker *worker; struct task_struct *task; int node = NUMA_NO_NODE; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) return ERR_PTR(-ENOMEM); kthread_init_worker(worker); if (cpu >= 0) node = cpu_to_node(cpu); task = __kthread_create_on_node(kthread_worker_fn, worker, node, namefmt, args); if (IS_ERR(task)) goto fail_task; if (cpu >= 0) kthread_bind(task, cpu); worker->flags = flags; worker->task = task; wake_up_process(task); return worker; fail_task: kfree(worker); return ERR_CAST(task); } /** * kthread_create_worker - create a kthread worker * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker(unsigned int flags, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); worker = __kthread_create_worker(-1, flags, namefmt, args); va_end(args); return worker; } EXPORT_SYMBOL(kthread_create_worker); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). * * Use a valid CPU number if you want to bind the kthread worker * to the given CPU and the associated NUMA node. * * A good practice is to add the cpu number also into the worker name. * For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). * * CPU hotplug: * The kthread worker API is simple and generic. It just provides a way * to create, use, and destroy workers. * * It is up to the API user how to handle CPU hotplug. They have to decide * how to handle pending work items, prevent queuing new ones, and * restore the functionality when the CPU goes off and on. There are a * few catches: * * - CPU affinity gets lost when it is scheduled on an offline CPU. * * - The worker might not exist when the CPU was off when the user * created the workers. * * Good practice is to implement two CPU hotplug callbacks and to * destroy/create the worker when the CPU goes down/up. * * Return: * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); worker = __kthread_create_worker(cpu, flags, namefmt, args); va_end(args); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_cpu); /* * Returns true when the work could not be queued at the moment. * It happens when it is already pending in a worker list * or when it is being cancelled. */ static inline bool queuing_blocked(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); return !list_empty(&work->node) || work->canceling; } static void kthread_insert_work_sanity_check(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); WARN_ON_ONCE(!list_empty(&work->node)); /* Do not use a work with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker && work->worker != worker); } /* insert @work before @pos in @worker */ static void kthread_insert_work(struct kthread_worker *worker, struct kthread_work *work, struct list_head *pos) { kthread_insert_work_sanity_check(worker, work); trace_sched_kthread_work_queue_work(worker, work); list_add_tail(&work->node, pos); work->worker = worker; if (!worker->current_work && likely(worker->task)) wake_up_process(worker->task); } /** * kthread_queue_work - queue a kthread_work * @worker: target kthread_worker * @work: kthread_work to queue * * Queue @work to work processor @task for async execution. @task * must have been created with kthread_worker_create(). Returns %true * if @work was successfully queued, %false if it was already pending. * * Reinitialize the work if it needs to be used by another worker. * For example, when the worker was stopped and started again. */ bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work) { bool ret = false; unsigned long flags; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { kthread_insert_work(worker, work, &worker->work_list); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_work); /** * kthread_delayed_work_timer_fn - callback that queues the associated kthread * delayed work when the timer expires. * @t: pointer to the expired timer * * The format of the function is defined by struct timer_list. * It should have been called from irqsafe timer with irq already off. */ void kthread_delayed_work_timer_fn(struct timer_list *t) { struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; unsigned long flags; /* * This might happen when a pending work is reinitialized. * It means that it is used a wrong way. */ if (WARN_ON_ONCE(!worker)) return; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); /* Move the work from worker->delayed_work_list. */ WARN_ON_ONCE(list_empty(&work->node)); list_del_init(&work->node); if (!work->canceling) kthread_insert_work(worker, work, &worker->work_list); raw_spin_unlock_irqrestore(&worker->lock, flags); } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); static void __kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for * both optimization and correctness. The earliest @timer can * expire is on the closest next tick and delayed_work users depend * on that there's no such delay when @delay is 0. */ if (!delay) { kthread_insert_work(worker, work, &worker->work_list); return; } /* Be paranoid and try to detect possible races already now. */ kthread_insert_work_sanity_check(worker, work); list_add(&work->node, &worker->delayed_work_list); work->worker = worker; timer->expires = jiffies + delay; add_timer(timer); } /** * kthread_queue_delayed_work - queue the associated kthread work * after a delay. * @worker: target kthread_worker * @dwork: kthread_delayed_work to queue * @delay: number of jiffies to wait before queuing * * If the work has not been pending it starts a timer that will queue * the work after the given @delay. If @delay is zero, it queues the * work immediately. * * Return: %false if the @work has already been pending. It means that * either the timer was running or the work was queued. It returns %true * otherwise. */ bool kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; bool ret = false; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { __kthread_queue_delayed_work(worker, dwork, delay); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); struct kthread_flush_work { struct kthread_work work; struct completion done; }; static void kthread_flush_work_fn(struct kthread_work *work) { struct kthread_flush_work *fwork = container_of(work, struct kthread_flush_work, work); complete(&fwork->done); } /** * kthread_flush_work - flush a kthread_work * @work: work to flush * * If @work is queued or executing, wait for it to finish execution. */ void kthread_flush_work(struct kthread_work *work) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; struct kthread_worker *worker; bool noop = false; worker = work->worker; if (!worker) return; raw_spin_lock_irq(&worker->lock); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (!list_empty(&work->node)) kthread_insert_work(worker, &fwork.work, work->node.next); else if (worker->current_work == work) kthread_insert_work(worker, &fwork.work, worker->work_list.next); else noop = true; raw_spin_unlock_irq(&worker->lock); if (!noop) wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_work); /* * Make sure that the timer is neither set nor running and could * not manipulate the work list_head any longer. * * The function is called under worker->lock. The lock is temporary * released but the timer can't be set again in the meantime. */ static void kthread_cancel_delayed_work_timer(struct kthread_work *work, unsigned long *flags) { struct kthread_delayed_work *dwork = container_of(work, struct kthread_delayed_work, work); struct kthread_worker *worker = work->worker; /* * del_timer_sync() must be called to make sure that the timer * callback is not running. The lock must be temporary released * to avoid a deadlock with the callback. In the meantime, * any queuing is blocked by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, *flags); del_timer_sync(&dwork->timer); raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } /* * This function removes the work from the worker queue. * * It is called under worker->lock. The caller must make sure that * the timer used by delayed work is not running, e.g. by calling * kthread_cancel_delayed_work_timer(). * * The work might still be in use when this function finishes. See the * current_work proceed by the worker. * * Return: %true if @work was pending and successfully canceled, * %false if @work was not pending */ static bool __kthread_cancel_work(struct kthread_work *work) { /* * Try to remove the work from a worker list. It might either * be from worker->work_list or from worker->delayed_work_list. */ if (!list_empty(&work->node)) { list_del_init(&work->node); return true; } return false; } /** * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work * @worker: kthread worker to use * @dwork: kthread delayed work to queue * @delay: number of jiffies to wait before queuing * * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise, * modify @dwork's timer so that it expires after @delay. If @delay is zero, * @work is guaranteed to be queued immediately. * * Return: %false if @dwork was idle and queued, %true otherwise. * * A special case is when the work is being canceled in parallel. * It might be caused either by the real kthread_cancel_delayed_work_sync() * or yet another kthread_mod_delayed_work() call. We let the other command * win and return %true here. The return value can be used for reference * counting and the number of queued works stays the same. Anyway, the caller * is supposed to synchronize these operations a reasonable way. * * This function is safe to call from any context including IRQ handler. * See __kthread_cancel_work() and kthread_delayed_work_timer_fn() * for details. */ bool kthread_mod_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; int ret; raw_spin_lock_irqsave(&worker->lock, flags); /* Do not bother with canceling when never queued. */ if (!work->worker) { ret = false; goto fast_queue; } /* Work must not be used with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker != worker); /* * Temporary cancel the work but do not fight with another command * that is canceling the work as well. * * It is a bit tricky because of possible races with another * mod_delayed_work() and cancel_delayed_work() callers. * * The timer must be canceled first because worker->lock is released * when doing so. But the work can be removed from the queue (list) * only when it can be queued again so that the return value can * be used for reference counting. */ kthread_cancel_delayed_work_timer(work, &flags); if (work->canceling) { /* The number of works in the queue does not change. */ ret = true; goto out; } ret = __kthread_cancel_work(work); fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) { struct kthread_worker *worker = work->worker; unsigned long flags; int ret = false; if (!worker) goto out; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (is_dwork) kthread_cancel_delayed_work_timer(work, &flags); ret = __kthread_cancel_work(work); if (worker->current_work != work) goto out_fast; /* * The work is in progress and we need to wait with the lock released. * In the meantime, block any queuing by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, flags); kthread_flush_work(work); raw_spin_lock_irqsave(&worker->lock, flags); work->canceling--; out_fast: raw_spin_unlock_irqrestore(&worker->lock, flags); out: return ret; } /** * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish * @work: the kthread work to cancel * * Cancel @work and wait for its execution to finish. This function * can be used even if the work re-queues itself. On return from this * function, @work is guaranteed to be not pending or executing on any CPU. * * kthread_cancel_work_sync(&delayed_work->work) must not be used for * delayed_work's. Use kthread_cancel_delayed_work_sync() instead. * * The caller must ensure that the worker on which @work was last * queued can't be destroyed before this function returns. * * Return: %true if @work was pending, %false otherwise. */ bool kthread_cancel_work_sync(struct kthread_work *work) { return __kthread_cancel_work_sync(work, false); } EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); /** * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and * wait for it to finish. * @dwork: the kthread delayed work to cancel * * This is kthread_cancel_work_sync() for delayed works. * * Return: %true if @dwork was pending, %false otherwise. */ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork) { return __kthread_cancel_work_sync(&dwork->work, true); } EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync); /** * kthread_flush_worker - flush all current works on a kthread_worker * @worker: worker to flush * * Wait until all currently executing or pending works on @worker are * finished. */ void kthread_flush_worker(struct kthread_worker *worker) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; kthread_queue_work(worker, &fwork.work); wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_worker); /** * kthread_destroy_worker - destroy a kthread worker * @worker: worker to be destroyed * * Flush and destroy @worker. The simple flush is enough because the kthread * worker API is used only in trivial scenarios. There are no multi-step state * machines needed. * * Note that this function is not responsible for handling delayed work, so * caller should be responsible for queuing or canceling all delayed work items * before invoke this function. */ void kthread_destroy_worker(struct kthread_worker *worker) { struct task_struct *task; task = worker->task; if (WARN_ON(!task)) return; kthread_flush_worker(worker); kthread_stop(task); WARN_ON(!list_empty(&worker->delayed_work_list)); WARN_ON(!list_empty(&worker->work_list)); kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); /** * kthread_use_mm - make the calling kthread operate on an address space * @mm: address space to operate on */ void kthread_use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); /* * It is possible for mm to be the same as tsk->active_mm, but * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm), * because these references are not equivalent. */ mmgrab(mm); task_lock(tsk); /* Hold off tlb flush IPIs while switching mm's */ local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); local_irq_enable(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch finish_arch_post_lock_switch(); #endif /* * When a kthread starts operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after storing to tsk->mm, before accessing * user-space memory. A full memory barrier for membarrier * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by * mmdrop_lazy_tlb(). */ mmdrop_lazy_tlb(active_mm); } EXPORT_SYMBOL_GPL(kthread_use_mm); /** * kthread_unuse_mm - reverse the effect of kthread_use_mm() * @mm: address space to operate on */ void kthread_unuse_mm(struct mm_struct *mm) { struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); task_lock(tsk); /* * When a kthread stops operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after accessing user-space memory, before * clearing tsk->mm. */ smp_mb__after_spinlock(); local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); mmgrab_lazy_tlb(mm); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); task_unlock(tsk); mmdrop(mm); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); #ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread * @css: the cgroup info * * Current thread must be a kthread. The thread is running jobs on behalf of * other threads. In some cases, we expect the jobs attach cgroup info of * original threads instead of that of current thread. This function stores * original thread's cgroup info in current kthread context for later * retrieval. */ void kthread_associate_blkcg(struct cgroup_subsys_state *css) { struct kthread *kthread; if (!(current->flags & PF_KTHREAD)) return; kthread = to_kthread(current); if (!kthread) return; if (kthread->blkcg_css) { css_put(kthread->blkcg_css); kthread->blkcg_css = NULL; } if (css) { css_get(css); kthread->blkcg_css = css; } } EXPORT_SYMBOL(kthread_associate_blkcg); /** * kthread_blkcg - get associated blkcg css of current kthread * * Current thread must be a kthread. */ struct cgroup_subsys_state *kthread_blkcg(void) { struct kthread *kthread; if (current->flags & PF_KTHREAD) { kthread = to_kthread(current); if (kthread) return kthread->blkcg_css; } return NULL; } #endif
410 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 // SPDX-License-Identifier: GPL-2.0-only /* * Common code for control of lockd and nfsv4 grace periods. * * Transplanted from lockd code */ #include <linux/module.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/fs.h> #include <linux/filelock.h> static unsigned int grace_net_id; static DEFINE_SPINLOCK(grace_lock); /** * locks_start_grace * @net: net namespace that this lock manager belongs to * @lm: who this grace period is for * * A grace period is a period during which locks should not be given * out. Currently grace periods are only enforced by the two lock * managers (lockd and nfsd), using the locks_in_grace() function to * check when they are in a grace period. * * This function is called to start a grace period. */ void locks_start_grace(struct net *net, struct lock_manager *lm) { struct list_head *grace_list = net_generic(net, grace_net_id); spin_lock(&grace_lock); if (list_empty(&lm->list)) list_add(&lm->list, grace_list); else WARN(1, "double list_add attempt detected in net %x %s\n", net->ns.inum, (net == &init_net) ? "(init_net)" : ""); spin_unlock(&grace_lock); } EXPORT_SYMBOL_GPL(locks_start_grace); /** * locks_end_grace * @lm: who this grace period is for * * Call this function to state that the given lock manager is ready to * resume regular locking. The grace period will not end until all lock * managers that called locks_start_grace() also call locks_end_grace(). * Note that callers count on it being safe to call this more than once, * and the second call should be a no-op. */ void locks_end_grace(struct lock_manager *lm) { spin_lock(&grace_lock); list_del_init(&lm->list); spin_unlock(&grace_lock); } EXPORT_SYMBOL_GPL(locks_end_grace); static bool __state_in_grace(struct net *net, bool open) { struct list_head *grace_list = net_generic(net, grace_net_id); struct lock_manager *lm; if (!open) return !list_empty(grace_list); spin_lock(&grace_lock); list_for_each_entry(lm, grace_list, list) { if (lm->block_opens) { spin_unlock(&grace_lock); return true; } } spin_unlock(&grace_lock); return false; } /** * locks_in_grace * @net: network namespace * * Lock managers call this function to determine when it is OK for them * to answer ordinary lock requests, and when they should accept only * lock reclaims. */ bool locks_in_grace(struct net *net) { return __state_in_grace(net, false); } EXPORT_SYMBOL_GPL(locks_in_grace); bool opens_in_grace(struct net *net) { return __state_in_grace(net, true); } EXPORT_SYMBOL_GPL(opens_in_grace); static int __net_init grace_init_net(struct net *net) { struct list_head *grace_list = net_generic(net, grace_net_id); INIT_LIST_HEAD(grace_list); return 0; } static void __net_exit grace_exit_net(struct net *net) { struct list_head *grace_list = net_generic(net, grace_net_id); WARN_ONCE(!list_empty(grace_list), "net %x %s: grace_list is not empty\n", net->ns.inum, __func__); } static struct pernet_operations grace_net_ops = { .init = grace_init_net, .exit = grace_exit_net, .id = &grace_net_id, .size = sizeof(struct list_head), }; static int __init init_grace(void) { return register_pernet_subsys(&grace_net_ops); } static void __exit exit_grace(void) { unregister_pernet_subsys(&grace_net_ops); } MODULE_AUTHOR("Jeff Layton <jlayton@primarydata.com>"); MODULE_LICENSE("GPL"); module_init(init_grace) module_exit(exit_grace)
7 1262 1258 53 5 2122 2104 2107 460 2107 2119 59 58 59 59 2118 2118 2116 2089 1723 2120 1736 1817 2082 2152 1261 1237 862 1233 900 787 863 1704 1701 1703 728 728 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 // SPDX-License-Identifier: GPL-2.0 /* * fs/sysfs/group.c - Operations for adding/removing multiple files at once. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * Copyright (c) 2013 Greg Kroah-Hartman * Copyright (c) 2013 The Linux Foundation */ #include <linux/kobject.h> #include <linux/module.h> #include <linux/dcache.h> #include <linux/namei.h> #include <linux/err.h> #include <linux/fs.h> #include "sysfs.h" static void remove_files(struct kernfs_node *parent, const struct attribute_group *grp) { struct attribute *const *attr; struct bin_attribute *const *bin_attr; if (grp->attrs) for (attr = grp->attrs; *attr; attr++) kernfs_remove_by_name(parent, (*attr)->name); if (grp->bin_attrs) for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) kernfs_remove_by_name(parent, (*bin_attr)->attr.name); } static int create_files(struct kernfs_node *parent, struct kobject *kobj, kuid_t uid, kgid_t gid, const struct attribute_group *grp, int update) { struct attribute *const *attr; struct bin_attribute *const *bin_attr; int error = 0, i; if (grp->attrs) { for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) { umode_t mode = (*attr)->mode; /* * In update mode, we're changing the permissions or * visibility. Do this by first removing then * re-adding (if required) the file. */ if (update) kernfs_remove_by_name(parent, (*attr)->name); if (grp->is_visible) { mode = grp->is_visible(kobj, *attr, i); if (!mode) continue; } WARN(mode & ~(SYSFS_PREALLOC | 0664), "Attribute %s: Invalid permissions 0%o\n", (*attr)->name, mode); mode &= SYSFS_PREALLOC | 0664; error = sysfs_add_file_mode_ns(parent, *attr, mode, uid, gid, NULL); if (unlikely(error)) break; } if (error) { remove_files(parent, grp); goto exit; } } if (grp->bin_attrs) { for (i = 0, bin_attr = grp->bin_attrs; *bin_attr; i++, bin_attr++) { umode_t mode = (*bin_attr)->attr.mode; if (update) kernfs_remove_by_name(parent, (*bin_attr)->attr.name); if (grp->is_bin_visible) { mode = grp->is_bin_visible(kobj, *bin_attr, i); if (!mode) continue; } WARN(mode & ~(SYSFS_PREALLOC | 0664), "Attribute %s: Invalid permissions 0%o\n", (*bin_attr)->attr.name, mode); mode &= SYSFS_PREALLOC | 0664; error = sysfs_add_bin_file_mode_ns(parent, *bin_attr, mode, uid, gid, NULL); if (error) break; } if (error) remove_files(parent, grp); } exit: return error; } static int internal_create_group(struct kobject *kobj, int update, const struct attribute_group *grp) { struct kernfs_node *kn; kuid_t uid; kgid_t gid; int error; if (WARN_ON(!kobj || (!update && !kobj->sd))) return -EINVAL; /* Updates may happen before the object has been instantiated */ if (unlikely(update && !kobj->sd)) return -EINVAL; if (!grp->attrs && !grp->bin_attrs) { pr_debug("sysfs: (bin_)attrs not set by subsystem for group: %s/%s, skipping\n", kobj->name, grp->name ?: ""); return 0; } kobject_get_ownership(kobj, &uid, &gid); if (grp->name) { if (update) { kn = kernfs_find_and_get(kobj->sd, grp->name); if (!kn) { pr_warn("Can't update unknown attr grp name: %s/%s\n", kobj->name, grp->name); return -EINVAL; } } else { kn = kernfs_create_dir_ns(kobj->sd, grp->name, S_IRWXU | S_IRUGO | S_IXUGO, uid, gid, kobj, NULL); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(kobj->sd, grp->name); return PTR_ERR(kn); } } } else { kn = kobj->sd; } kernfs_get(kn); error = create_files(kn, kobj, uid, gid, grp, update); if (error) { if (grp->name) kernfs_remove(kn); } kernfs_put(kn); if (grp->name && update) kernfs_put(kn); return error; } /** * sysfs_create_group - given a directory kobject, create an attribute group * @kobj: The kobject to create the group on * @grp: The attribute group to create * * This function creates a group for the first time. It will explicitly * warn and error if any of the attribute files being created already exist. * * Returns 0 on success or error code on failure. */ int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) { return internal_create_group(kobj, 0, grp); } EXPORT_SYMBOL_GPL(sysfs_create_group); static int internal_create_groups(struct kobject *kobj, int update, const struct attribute_group **groups) { int error = 0; int i; if (!groups) return 0; for (i = 0; groups[i]; i++) { error = internal_create_group(kobj, update, groups[i]); if (error) { while (--i >= 0) sysfs_remove_group(kobj, groups[i]); break; } } return error; } /** * sysfs_create_groups - given a directory kobject, create a bunch of attribute groups * @kobj: The kobject to create the group on * @groups: The attribute groups to create, NULL terminated * * This function creates a bunch of attribute groups. If an error occurs when * creating a group, all previously created groups will be removed, unwinding * everything back to the original state when this function was called. * It will explicitly warn and error if any of the attribute files being * created already exist. * * Returns 0 on success or error code from sysfs_create_group on failure. */ int sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups) { return internal_create_groups(kobj, 0, groups); } EXPORT_SYMBOL_GPL(sysfs_create_groups); /** * sysfs_update_groups - given a directory kobject, create a bunch of attribute groups * @kobj: The kobject to update the group on * @groups: The attribute groups to update, NULL terminated * * This function update a bunch of attribute groups. If an error occurs when * updating a group, all previously updated groups will be removed together * with already existing (not updated) attributes. * * Returns 0 on success or error code from sysfs_update_group on failure. */ int sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups) { return internal_create_groups(kobj, 1, groups); } EXPORT_SYMBOL_GPL(sysfs_update_groups); /** * sysfs_update_group - given a directory kobject, update an attribute group * @kobj: The kobject to update the group on * @grp: The attribute group to update * * This function updates an attribute group. Unlike * sysfs_create_group(), it will explicitly not warn or error if any * of the attribute files being created already exist. Furthermore, * if the visibility of the files has changed through the is_visible() * callback, it will update the permissions and add or remove the * relevant files. Changing a group's name (subdirectory name under * kobj's directory in sysfs) is not allowed. * * The primary use for this function is to call it after making a change * that affects group visibility. * * Returns 0 on success or error code on failure. */ int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp) { return internal_create_group(kobj, 1, grp); } EXPORT_SYMBOL_GPL(sysfs_update_group); /** * sysfs_remove_group: remove a group from a kobject * @kobj: kobject to remove the group from * @grp: group to remove * * This function removes a group of attributes from a kobject. The attributes * previously have to have been created for this group, otherwise it will fail. */ void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { struct kernfs_node *parent = kobj->sd; struct kernfs_node *kn; if (grp->name) { kn = kernfs_find_and_get(parent, grp->name); if (!kn) { WARN(!kn, KERN_WARNING "sysfs group '%s' not found for kobject '%s'\n", grp->name, kobject_name(kobj)); return; } } else { kn = parent; kernfs_get(kn); } remove_files(kn, grp); if (grp->name) kernfs_remove(kn); kernfs_put(kn); } EXPORT_SYMBOL_GPL(sysfs_remove_group); /** * sysfs_remove_groups - remove a list of groups * * @kobj: The kobject for the groups to be removed from * @groups: NULL terminated list of groups to be removed * * If groups is not NULL, remove the specified groups from the kobject. */ void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups) { int i; if (!groups) return; for (i = 0; groups[i]; i++) sysfs_remove_group(kobj, groups[i]); } EXPORT_SYMBOL_GPL(sysfs_remove_groups); /** * sysfs_merge_group - merge files into a pre-existing attribute group. * @kobj: The kobject containing the group. * @grp: The files to create and the attribute group they belong to. * * This function returns an error if the group doesn't exist or any of the * files already exist in that group, in which case none of the new files * are created. */ int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { struct kernfs_node *parent; kuid_t uid; kgid_t gid; int error = 0; struct attribute *const *attr; int i; parent = kernfs_find_and_get(kobj->sd, grp->name); if (!parent) return -ENOENT; kobject_get_ownership(kobj, &uid, &gid); for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) error = sysfs_add_file_mode_ns(parent, *attr, (*attr)->mode, uid, gid, NULL); if (error) { while (--i >= 0) kernfs_remove_by_name(parent, (*--attr)->name); } kernfs_put(parent); return error; } EXPORT_SYMBOL_GPL(sysfs_merge_group); /** * sysfs_unmerge_group - remove files from a pre-existing attribute group. * @kobj: The kobject containing the group. * @grp: The files to remove and the attribute group they belong to. */ void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { struct kernfs_node *parent; struct attribute *const *attr; parent = kernfs_find_and_get(kobj->sd, grp->name); if (parent) { for (attr = grp->attrs; *attr; ++attr) kernfs_remove_by_name(parent, (*attr)->name); kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_unmerge_group); /** * sysfs_add_link_to_group - add a symlink to an attribute group. * @kobj: The kobject containing the group. * @group_name: The name of the group. * @target: The target kobject of the symlink to create. * @link_name: The name of the symlink to create. */ int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { struct kernfs_node *parent; int error = 0; parent = kernfs_find_and_get(kobj->sd, group_name); if (!parent) return -ENOENT; error = sysfs_create_link_sd(parent, target, link_name); kernfs_put(parent); return error; } EXPORT_SYMBOL_GPL(sysfs_add_link_to_group); /** * sysfs_remove_link_from_group - remove a symlink from an attribute group. * @kobj: The kobject containing the group. * @group_name: The name of the group. * @link_name: The name of the symlink to remove. */ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { struct kernfs_node *parent; parent = kernfs_find_and_get(kobj->sd, group_name); if (parent) { kernfs_remove_by_name(parent, link_name); kernfs_put(parent); } } EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); /** * compat_only_sysfs_link_entry_to_kobj - add a symlink to a kobject pointing * to a group or an attribute * @kobj: The kobject containing the group. * @target_kobj: The target kobject. * @target_name: The name of the target group or attribute. * @symlink_name: The name of the symlink file (target_name will be * considered if symlink_name is NULL). */ int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name) { struct kernfs_node *target; struct kernfs_node *entry; struct kernfs_node *link; /* * We don't own @target_kobj and it may be removed at any time. * Synchronize using sysfs_symlink_target_lock. See sysfs_remove_dir() * for details. */ spin_lock(&sysfs_symlink_target_lock); target = target_kobj->sd; if (target) kernfs_get(target); spin_unlock(&sysfs_symlink_target_lock); if (!target) return -ENOENT; entry = kernfs_find_and_get(target, target_name); if (!entry) { kernfs_put(target); return -ENOENT; } if (!symlink_name) symlink_name = target_name; link = kernfs_create_link(kobj->sd, symlink_name, entry); if (PTR_ERR(link) == -EEXIST) sysfs_warn_dup(kobj->sd, symlink_name); kernfs_put(entry); kernfs_put(target); return PTR_ERR_OR_ZERO(link); } EXPORT_SYMBOL_GPL(compat_only_sysfs_link_entry_to_kobj); static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn, const struct attribute_group *grp, struct iattr *newattrs) { struct kernfs_node *kn; int error; if (grp->attrs) { struct attribute *const *attr; for (attr = grp->attrs; *attr; attr++) { kn = kernfs_find_and_get(grp_kn, (*attr)->name); if (!kn) return -ENOENT; error = kernfs_setattr(kn, newattrs); kernfs_put(kn); if (error) return error; } } if (grp->bin_attrs) { struct bin_attribute *const *bin_attr; for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) { kn = kernfs_find_and_get(grp_kn, (*bin_attr)->attr.name); if (!kn) return -ENOENT; error = kernfs_setattr(kn, newattrs); kernfs_put(kn); if (error) return error; } } return 0; } /** * sysfs_group_change_owner - change owner of an attribute group. * @kobj: The kobject containing the group. * @grp: The attribute group. * @kuid: new owner's kuid * @kgid: new owner's kgid * * Returns 0 on success or error code on failure. */ int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *grp, kuid_t kuid, kgid_t kgid) { struct kernfs_node *grp_kn; int error; struct iattr newattrs = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = kuid, .ia_gid = kgid, }; if (!kobj->state_in_sysfs) return -EINVAL; if (grp->name) { grp_kn = kernfs_find_and_get(kobj->sd, grp->name); } else { kernfs_get(kobj->sd); grp_kn = kobj->sd; } if (!grp_kn) return -ENOENT; error = kernfs_setattr(grp_kn, &newattrs); if (!error) error = sysfs_group_attrs_change_owner(grp_kn, grp, &newattrs); kernfs_put(grp_kn); return error; } EXPORT_SYMBOL_GPL(sysfs_group_change_owner); /** * sysfs_groups_change_owner - change owner of a set of attribute groups. * @kobj: The kobject containing the groups. * @groups: The attribute groups. * @kuid: new owner's kuid * @kgid: new owner's kgid * * Returns 0 on success or error code on failure. */ int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid) { int error = 0, i; if (!kobj->state_in_sysfs) return -EINVAL; if (!groups) return 0; for (i = 0; groups[i]; i++) { error = sysfs_group_change_owner(kobj, groups[i], kuid, kgid); if (error) break; } return error; } EXPORT_SYMBOL_GPL(sysfs_groups_change_owner);
73 1 3 1 61 2 46 17 61 3 56 7 54 21 3 16 16 1 1 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 // SPDX-License-Identifier: GPL-2.0-or-later /* * GRE over IPv4 demultiplexer driver * * Authors: Dmitry Kozlov (xeb@mail.ru) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/if.h> #include <linux/icmp.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/skbuff.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/netdevice.h> #include <linux/if_tunnel.h> #include <linux/spinlock.h> #include <net/protocol.h> #include <net/gre.h> #include <net/erspan.h> #include <net/icmp.h> #include <net/route.h> #include <net/xfrm.h> static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; int gre_add_protocol(const struct gre_protocol *proto, u8 version) { if (version >= GREPROTO_MAX) return -EINVAL; return (cmpxchg((const struct gre_protocol **)&gre_proto[version], NULL, proto) == NULL) ? 0 : -EBUSY; } EXPORT_SYMBOL_GPL(gre_add_protocol); int gre_del_protocol(const struct gre_protocol *proto, u8 version) { int ret; if (version >= GREPROTO_MAX) return -EINVAL; ret = (cmpxchg((const struct gre_protocol **)&gre_proto[version], proto, NULL) == proto) ? 0 : -EBUSY; if (ret) return ret; synchronize_rcu(); return 0; } EXPORT_SYMBOL_GPL(gre_del_protocol); /* Fills in tpi and returns header length to be pulled. * Note that caller must use pskb_may_pull() before pulling GRE header. */ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, bool *csum_err, __be16 proto, int nhs) { const struct gre_base_hdr *greh; __be32 *options; int hdr_len; if (unlikely(!pskb_may_pull(skb, nhs + sizeof(struct gre_base_hdr)))) return -EINVAL; greh = (struct gre_base_hdr *)(skb->data + nhs); if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; tpi->flags = gre_flags_to_tnl_flags(greh->flags); hdr_len = gre_calc_hlen(tpi->flags); if (!pskb_may_pull(skb, nhs + hdr_len)) return -EINVAL; greh = (struct gre_base_hdr *)(skb->data + nhs); tpi->proto = greh->protocol; options = (__be32 *)(greh + 1); if (greh->flags & GRE_CSUM) { if (!skb_checksum_simple_validate(skb)) { skb_checksum_try_convert(skb, IPPROTO_GRE, null_compute_pseudo); } else if (csum_err) { *csum_err = true; return -EINVAL; } options++; } if (greh->flags & GRE_KEY) { tpi->key = *options; options++; } else { tpi->key = 0; } if (unlikely(greh->flags & GRE_SEQ)) { tpi->seq = *options; options++; } else { tpi->seq = 0; } /* WCCP version 1 and 2 protocol decoding. * - Change protocol to IPv4/IPv6 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header */ if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { u8 _val, *val; val = skb_header_pointer(skb, nhs + hdr_len, sizeof(_val), &_val); if (!val) return -EINVAL; tpi->proto = proto; if ((*val & 0xF0) != 0x40) hdr_len += 4; } tpi->hdr_len = hdr_len; /* ERSPAN ver 1 and 2 protocol sets GRE key field * to 0 and sets the configured key in the * inner erspan header field */ if ((greh->protocol == htons(ETH_P_ERSPAN) && hdr_len != 4) || greh->protocol == htons(ETH_P_ERSPAN2)) { struct erspan_base_hdr *ershdr; if (!pskb_may_pull(skb, nhs + hdr_len + sizeof(*ershdr))) return -EINVAL; ershdr = (struct erspan_base_hdr *)(skb->data + nhs + hdr_len); tpi->key = cpu_to_be32(get_session_id(ershdr)); } return hdr_len; } EXPORT_SYMBOL(gre_parse_header); static int gre_rcv(struct sk_buff *skb) { const struct gre_protocol *proto; u8 ver; int ret; if (!pskb_may_pull(skb, 12)) goto drop; ver = skb->data[1]&0x7f; if (ver >= GREPROTO_MAX) goto drop; rcu_read_lock(); proto = rcu_dereference(gre_proto[ver]); if (!proto || !proto->handler) goto drop_unlock; ret = proto->handler(skb); rcu_read_unlock(); return ret; drop_unlock: rcu_read_unlock(); drop: kfree_skb(skb); return NET_RX_DROP; } static int gre_err(struct sk_buff *skb, u32 info) { const struct gre_protocol *proto; const struct iphdr *iph = (const struct iphdr *)skb->data; u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f; int err = 0; if (ver >= GREPROTO_MAX) return -EINVAL; rcu_read_lock(); proto = rcu_dereference(gre_proto[ver]); if (proto && proto->err_handler) proto->err_handler(skb, info); else err = -EPROTONOSUPPORT; rcu_read_unlock(); return err; } static const struct net_protocol net_gre_protocol = { .handler = gre_rcv, .err_handler = gre_err, }; static int __init gre_init(void) { pr_info("GRE over IPv4 demultiplexor driver\n"); if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { pr_err("can't add protocol\n"); return -EAGAIN; } return 0; } static void __exit gre_exit(void) { inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); } module_init(gre_init); module_exit(gre_exit); MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver"); MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); MODULE_LICENSE("GPL");
3 2 2 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2005 Marc Kleine-Budde, Pengutronix * Copyright (C) 2006 Andrey Volkov, Varma Electronics * Copyright (C) 2008-2009 Wolfgang Grandegger <wg@grandegger.com> */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/workqueue.h> #include <linux/can.h> #include <linux/can/can-ml.h> #include <linux/can/dev.h> #include <linux/can/skb.h> #include <linux/gpio/consumer.h> #include <linux/of.h> static void can_update_state_error_stats(struct net_device *dev, enum can_state new_state) { struct can_priv *priv = netdev_priv(dev); if (new_state <= priv->state) return; switch (new_state) { case CAN_STATE_ERROR_WARNING: priv->can_stats.error_warning++; break; case CAN_STATE_ERROR_PASSIVE: priv->can_stats.error_passive++; break; case CAN_STATE_BUS_OFF: priv->can_stats.bus_off++; break; default: break; } } static int can_tx_state_to_frame(struct net_device *dev, enum can_state state) { switch (state) { case CAN_STATE_ERROR_ACTIVE: return CAN_ERR_CRTL_ACTIVE; case CAN_STATE_ERROR_WARNING: return CAN_ERR_CRTL_TX_WARNING; case CAN_STATE_ERROR_PASSIVE: return CAN_ERR_CRTL_TX_PASSIVE; default: return 0; } } static int can_rx_state_to_frame(struct net_device *dev, enum can_state state) { switch (state) { case CAN_STATE_ERROR_ACTIVE: return CAN_ERR_CRTL_ACTIVE; case CAN_STATE_ERROR_WARNING: return CAN_ERR_CRTL_RX_WARNING; case CAN_STATE_ERROR_PASSIVE: return CAN_ERR_CRTL_RX_PASSIVE; default: return 0; } } const char *can_get_state_str(const enum can_state state) { switch (state) { case CAN_STATE_ERROR_ACTIVE: return "Error Active"; case CAN_STATE_ERROR_WARNING: return "Error Warning"; case CAN_STATE_ERROR_PASSIVE: return "Error Passive"; case CAN_STATE_BUS_OFF: return "Bus Off"; case CAN_STATE_STOPPED: return "Stopped"; case CAN_STATE_SLEEPING: return "Sleeping"; default: return "<unknown>"; } return "<unknown>"; } EXPORT_SYMBOL_GPL(can_get_state_str); static enum can_state can_state_err_to_state(u16 err) { if (err < CAN_ERROR_WARNING_THRESHOLD) return CAN_STATE_ERROR_ACTIVE; if (err < CAN_ERROR_PASSIVE_THRESHOLD) return CAN_STATE_ERROR_WARNING; if (err < CAN_BUS_OFF_THRESHOLD) return CAN_STATE_ERROR_PASSIVE; return CAN_STATE_BUS_OFF; } void can_state_get_by_berr_counter(const struct net_device *dev, const struct can_berr_counter *bec, enum can_state *tx_state, enum can_state *rx_state) { *tx_state = can_state_err_to_state(bec->txerr); *rx_state = can_state_err_to_state(bec->rxerr); } EXPORT_SYMBOL_GPL(can_state_get_by_berr_counter); void can_change_state(struct net_device *dev, struct can_frame *cf, enum can_state tx_state, enum can_state rx_state) { struct can_priv *priv = netdev_priv(dev); enum can_state new_state = max(tx_state, rx_state); if (unlikely(new_state == priv->state)) { netdev_warn(dev, "%s: oops, state did not change", __func__); return; } netdev_dbg(dev, "Controller changed from %s State (%d) into %s State (%d).\n", can_get_state_str(priv->state), priv->state, can_get_state_str(new_state), new_state); can_update_state_error_stats(dev, new_state); priv->state = new_state; if (!cf) return; if (unlikely(new_state == CAN_STATE_BUS_OFF)) { cf->can_id |= CAN_ERR_BUSOFF; return; } cf->can_id |= CAN_ERR_CRTL; cf->data[1] |= tx_state >= rx_state ? can_tx_state_to_frame(dev, tx_state) : 0; cf->data[1] |= tx_state <= rx_state ? can_rx_state_to_frame(dev, rx_state) : 0; } EXPORT_SYMBOL_GPL(can_change_state); /* CAN device restart for bus-off recovery */ static void can_restart(struct net_device *dev) { struct can_priv *priv = netdev_priv(dev); struct sk_buff *skb; struct can_frame *cf; int err; if (netif_carrier_ok(dev)) netdev_err(dev, "Attempt to restart for bus-off recovery, but carrier is OK?\n"); /* No synchronization needed because the device is bus-off and * no messages can come in or go out. */ can_flush_echo_skb(dev); /* send restart message upstream */ skb = alloc_can_err_skb(dev, &cf); if (skb) { cf->can_id |= CAN_ERR_RESTARTED; netif_rx(skb); } /* Now restart the device */ netif_carrier_on(dev); err = priv->do_set_mode(dev, CAN_MODE_START); if (err) { netdev_err(dev, "Restart failed, error %pe\n", ERR_PTR(err)); netif_carrier_off(dev); } else { netdev_dbg(dev, "Restarted\n"); priv->can_stats.restarts++; } } static void can_restart_work(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct can_priv *priv = container_of(dwork, struct can_priv, restart_work); can_restart(priv->dev); } int can_restart_now(struct net_device *dev) { struct can_priv *priv = netdev_priv(dev); /* A manual restart is only permitted if automatic restart is * disabled and the device is in the bus-off state */ if (priv->restart_ms) return -EINVAL; if (priv->state != CAN_STATE_BUS_OFF) return -EBUSY; cancel_delayed_work_sync(&priv->restart_work); can_restart(dev); return 0; } /* CAN bus-off * * This functions should be called when the device goes bus-off to * tell the netif layer that no more packets can be sent or received. * If enabled, a timer is started to trigger bus-off recovery. */ void can_bus_off(struct net_device *dev) { struct can_priv *priv = netdev_priv(dev); if (priv->restart_ms) netdev_info(dev, "bus-off, scheduling restart in %d ms\n", priv->restart_ms); else netdev_info(dev, "bus-off\n"); netif_carrier_off(dev); if (priv->restart_ms) schedule_delayed_work(&priv->restart_work, msecs_to_jiffies(priv->restart_ms)); } EXPORT_SYMBOL_GPL(can_bus_off); void can_setup(struct net_device *dev) { dev->type = ARPHRD_CAN; dev->mtu = CAN_MTU; dev->hard_header_len = 0; dev->addr_len = 0; dev->tx_queue_len = 10; /* New-style flags. */ dev->flags = IFF_NOARP; dev->features = NETIF_F_HW_CSUM; } /* Allocate and setup space for the CAN network device */ struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, unsigned int txqs, unsigned int rxqs) { struct can_ml_priv *can_ml; struct net_device *dev; struct can_priv *priv; int size; /* We put the driver's priv, the CAN mid layer priv and the * echo skb into the netdevice's priv. The memory layout for * the netdev_priv is like this: * * +-------------------------+ * | driver's priv | * +-------------------------+ * | struct can_ml_priv | * +-------------------------+ * | array of struct sk_buff | * +-------------------------+ */ size = ALIGN(sizeof_priv, NETDEV_ALIGN) + sizeof(struct can_ml_priv); if (echo_skb_max) size = ALIGN(size, sizeof(struct sk_buff *)) + echo_skb_max * sizeof(struct sk_buff *); dev = alloc_netdev_mqs(size, "can%d", NET_NAME_UNKNOWN, can_setup, txqs, rxqs); if (!dev) return NULL; priv = netdev_priv(dev); priv->dev = dev; can_ml = (void *)priv + ALIGN(sizeof_priv, NETDEV_ALIGN); can_set_ml_priv(dev, can_ml); if (echo_skb_max) { priv->echo_skb_max = echo_skb_max; priv->echo_skb = (void *)priv + (size - echo_skb_max * sizeof(struct sk_buff *)); } priv->state = CAN_STATE_STOPPED; INIT_DELAYED_WORK(&priv->restart_work, can_restart_work); return dev; } EXPORT_SYMBOL_GPL(alloc_candev_mqs); /* Free space of the CAN network device */ void free_candev(struct net_device *dev) { free_netdev(dev); } EXPORT_SYMBOL_GPL(free_candev); /* changing MTU and control mode for CAN/CANFD devices */ int can_change_mtu(struct net_device *dev, int new_mtu) { struct can_priv *priv = netdev_priv(dev); u32 ctrlmode_static = can_get_static_ctrlmode(priv); /* Do not allow changing the MTU while running */ if (dev->flags & IFF_UP) return -EBUSY; /* allow change of MTU according to the CANFD ability of the device */ switch (new_mtu) { case CAN_MTU: /* 'CANFD-only' controllers can not switch to CAN_MTU */ if (ctrlmode_static & CAN_CTRLMODE_FD) return -EINVAL; priv->ctrlmode &= ~CAN_CTRLMODE_FD; break; case CANFD_MTU: /* check for potential CANFD ability */ if (!(priv->ctrlmode_supported & CAN_CTRLMODE_FD) && !(ctrlmode_static & CAN_CTRLMODE_FD)) return -EINVAL; priv->ctrlmode |= CAN_CTRLMODE_FD; break; default: return -EINVAL; } dev->mtu = new_mtu; return 0; } EXPORT_SYMBOL_GPL(can_change_mtu); /* generic implementation of netdev_ops::ndo_eth_ioctl for CAN devices * supporting hardware timestamps */ int can_eth_ioctl_hwts(struct net_device *netdev, struct ifreq *ifr, int cmd) { struct hwtstamp_config hwts_cfg = { 0 }; switch (cmd) { case SIOCSHWTSTAMP: /* set */ if (copy_from_user(&hwts_cfg, ifr->ifr_data, sizeof(hwts_cfg))) return -EFAULT; if (hwts_cfg.tx_type == HWTSTAMP_TX_ON && hwts_cfg.rx_filter == HWTSTAMP_FILTER_ALL) return 0; return -ERANGE; case SIOCGHWTSTAMP: /* get */ hwts_cfg.tx_type = HWTSTAMP_TX_ON; hwts_cfg.rx_filter = HWTSTAMP_FILTER_ALL; if (copy_to_user(ifr->ifr_data, &hwts_cfg, sizeof(hwts_cfg))) return -EFAULT; return 0; default: return -EOPNOTSUPP; } } EXPORT_SYMBOL(can_eth_ioctl_hwts); /* generic implementation of ethtool_ops::get_ts_info for CAN devices * supporting hardware timestamps */ int can_ethtool_op_get_ts_info_hwts(struct net_device *dev, struct ethtool_ts_info *info) { info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE | SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_TX_HARDWARE | SOF_TIMESTAMPING_RX_HARDWARE | SOF_TIMESTAMPING_RAW_HARDWARE; info->phc_index = -1; info->tx_types = BIT(HWTSTAMP_TX_ON); info->rx_filters = BIT(HWTSTAMP_FILTER_ALL); return 0; } EXPORT_SYMBOL(can_ethtool_op_get_ts_info_hwts); /* Common open function when the device gets opened. * * This function should be called in the open function of the device * driver. */ int open_candev(struct net_device *dev) { struct can_priv *priv = netdev_priv(dev); if (!priv->bittiming.bitrate) { netdev_err(dev, "bit-timing not yet defined\n"); return -EINVAL; } /* For CAN FD the data bitrate has to be >= the arbitration bitrate */ if ((priv->ctrlmode & CAN_CTRLMODE_FD) && (!priv->data_bittiming.bitrate || priv->data_bittiming.bitrate < priv->bittiming.bitrate)) { netdev_err(dev, "incorrect/missing data bit-timing\n"); return -EINVAL; } /* Switch carrier on if device was stopped while in bus-off state */ if (!netif_carrier_ok(dev)) netif_carrier_on(dev); return 0; } EXPORT_SYMBOL_GPL(open_candev); #ifdef CONFIG_OF /* Common function that can be used to understand the limitation of * a transceiver when it provides no means to determine these limitations * at runtime. */ void of_can_transceiver(struct net_device *dev) { struct device_node *dn; struct can_priv *priv = netdev_priv(dev); struct device_node *np = dev->dev.parent->of_node; int ret; dn = of_get_child_by_name(np, "can-transceiver"); if (!dn) return; ret = of_property_read_u32(dn, "max-bitrate", &priv->bitrate_max); of_node_put(dn); if ((ret && ret != -EINVAL) || (!ret && !priv->bitrate_max)) netdev_warn(dev, "Invalid value for transceiver max bitrate. Ignoring bitrate limit.\n"); } EXPORT_SYMBOL_GPL(of_can_transceiver); #endif /* Common close function for cleanup before the device gets closed. * * This function should be called in the close function of the device * driver. */ void close_candev(struct net_device *dev) { struct can_priv *priv = netdev_priv(dev); cancel_delayed_work_sync(&priv->restart_work); can_flush_echo_skb(dev); } EXPORT_SYMBOL_GPL(close_candev); static int can_set_termination(struct net_device *ndev, u16 term) { struct can_priv *priv = netdev_priv(ndev); int set; if (term == priv->termination_gpio_ohms[CAN_TERMINATION_GPIO_ENABLED]) set = 1; else set = 0; gpiod_set_value(priv->termination_gpio, set); return 0; } static int can_get_termination(struct net_device *ndev) { struct can_priv *priv = netdev_priv(ndev); struct device *dev = ndev->dev.parent; struct gpio_desc *gpio; u32 term; int ret; /* Disabling termination by default is the safe choice: Else if many * bus participants enable it, no communication is possible at all. */ gpio = devm_gpiod_get_optional(dev, "termination", GPIOD_OUT_LOW); if (IS_ERR(gpio)) return dev_err_probe(dev, PTR_ERR(gpio), "Cannot get termination-gpios\n"); if (!gpio) return 0; ret = device_property_read_u32(dev, "termination-ohms", &term); if (ret) { netdev_err(ndev, "Cannot get termination-ohms: %pe\n", ERR_PTR(ret)); return ret; } if (term > U16_MAX) { netdev_err(ndev, "Invalid termination-ohms value (%u > %u)\n", term, U16_MAX); return -EINVAL; } priv->termination_const_cnt = ARRAY_SIZE(priv->termination_gpio_ohms); priv->termination_const = priv->termination_gpio_ohms; priv->termination_gpio = gpio; priv->termination_gpio_ohms[CAN_TERMINATION_GPIO_DISABLED] = CAN_TERMINATION_DISABLED; priv->termination_gpio_ohms[CAN_TERMINATION_GPIO_ENABLED] = term; priv->do_set_termination = can_set_termination; return 0; } static bool can_bittiming_const_valid(const struct can_bittiming_const *btc) { if (!btc) return true; if (!btc->sjw_max) return false; return true; } /* Register the CAN network device */ int register_candev(struct net_device *dev) { struct can_priv *priv = netdev_priv(dev); int err; /* Ensure termination_const, termination_const_cnt and * do_set_termination consistency. All must be either set or * unset. */ if ((!priv->termination_const != !priv->termination_const_cnt) || (!priv->termination_const != !priv->do_set_termination)) return -EINVAL; if (!priv->bitrate_const != !priv->bitrate_const_cnt) return -EINVAL; if (!priv->data_bitrate_const != !priv->data_bitrate_const_cnt) return -EINVAL; /* We only support either fixed bit rates or bit timing const. */ if ((priv->bitrate_const || priv->data_bitrate_const) && (priv->bittiming_const || priv->data_bittiming_const)) return -EINVAL; if (!can_bittiming_const_valid(priv->bittiming_const) || !can_bittiming_const_valid(priv->data_bittiming_const)) return -EINVAL; if (!priv->termination_const) { err = can_get_termination(dev); if (err) return err; } dev->rtnl_link_ops = &can_link_ops; netif_carrier_off(dev); return register_netdev(dev); } EXPORT_SYMBOL_GPL(register_candev); /* Unregister the CAN network device */ void unregister_candev(struct net_device *dev) { unregister_netdev(dev); } EXPORT_SYMBOL_GPL(unregister_candev); /* Test if a network device is a candev based device * and return the can_priv* if so. */ struct can_priv *safe_candev_priv(struct net_device *dev) { if (dev->type != ARPHRD_CAN || dev->rtnl_link_ops != &can_link_ops) return NULL; return netdev_priv(dev); } EXPORT_SYMBOL_GPL(safe_candev_priv); static __init int can_dev_init(void) { int err; err = can_netlink_register(); if (!err) pr_info("CAN device driver interface\n"); return err; } module_init(can_dev_init); static __exit void can_dev_exit(void) { can_netlink_unregister(); } module_exit(can_dev_exit); MODULE_ALIAS_RTNL_LINK("can");
2 2 2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" #include "bitset.h" struct privflags_req_info { struct ethnl_req_info base; }; struct privflags_reply_data { struct ethnl_reply_data base; const char (*priv_flag_names)[ETH_GSTRING_LEN]; unsigned int n_priv_flags; u32 priv_flags; }; #define PRIVFLAGS_REPDATA(__reply_base) \ container_of(__reply_base, struct privflags_reply_data, base) const struct nla_policy ethnl_privflags_get_policy[] = { [ETHTOOL_A_PRIVFLAGS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int ethnl_get_priv_flags_info(struct net_device *dev, unsigned int *count, const char (**names)[ETH_GSTRING_LEN]) { const struct ethtool_ops *ops = dev->ethtool_ops; int nflags; nflags = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS); if (nflags < 0) return nflags; if (names) { *names = kcalloc(nflags, ETH_GSTRING_LEN, GFP_KERNEL); if (!*names) return -ENOMEM; ops->get_strings(dev, ETH_SS_PRIV_FLAGS, (u8 *)*names); } /* We can pass more than 32 private flags to userspace via netlink but * we cannot get more with ethtool_ops::get_priv_flags(). Note that we * must not adjust nflags before allocating the space for flag names * as the buffer must be large enough for all flags. */ if (WARN_ONCE(nflags > 32, "device %s reports more than 32 private flags (%d)\n", netdev_name(dev), nflags)) nflags = 32; *count = nflags; return 0; } static int privflags_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; const char (*names)[ETH_GSTRING_LEN]; const struct ethtool_ops *ops; unsigned int nflags; int ret; ops = dev->ethtool_ops; if (!ops->get_priv_flags || !ops->get_sset_count || !ops->get_strings) return -EOPNOTSUPP; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = ethnl_get_priv_flags_info(dev, &nflags, &names); if (ret < 0) goto out_ops; data->priv_flags = ops->get_priv_flags(dev); data->priv_flag_names = names; data->n_priv_flags = nflags; out_ops: ethnl_ops_complete(dev); return ret; } static int privflags_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const u32 all_flags = ~(u32)0 >> (32 - data->n_priv_flags); return ethnl_bitset32_size(&data->priv_flags, &all_flags, data->n_priv_flags, data->priv_flag_names, compact); } static int privflags_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const u32 all_flags = ~(u32)0 >> (32 - data->n_priv_flags); return ethnl_put_bitset32(skb, ETHTOOL_A_PRIVFLAGS_FLAGS, &data->priv_flags, &all_flags, data->n_priv_flags, data->priv_flag_names, compact); } static void privflags_cleanup_data(struct ethnl_reply_data *reply_data) { struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_data); kfree(data->priv_flag_names); } /* PRIVFLAGS_SET */ const struct nla_policy ethnl_privflags_set_policy[] = { [ETHTOOL_A_PRIVFLAGS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_PRIVFLAGS_FLAGS] = { .type = NLA_NESTED }, }; static int ethnl_set_privflags_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; if (!info->attrs[ETHTOOL_A_PRIVFLAGS_FLAGS]) return -EINVAL; if (!ops->get_priv_flags || !ops->set_priv_flags || !ops->get_sset_count || !ops->get_strings) return -EOPNOTSUPP; return 1; } static int ethnl_set_privflags(struct ethnl_req_info *req_info, struct genl_info *info) { const char (*names)[ETH_GSTRING_LEN] = NULL; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; unsigned int nflags; bool mod = false; bool compact; u32 flags; int ret; ret = ethnl_bitset_is_compact(tb[ETHTOOL_A_PRIVFLAGS_FLAGS], &compact); if (ret < 0) return ret; ret = ethnl_get_priv_flags_info(dev, &nflags, compact ? NULL : &names); if (ret < 0) return ret; flags = dev->ethtool_ops->get_priv_flags(dev); ret = ethnl_update_bitset32(&flags, nflags, tb[ETHTOOL_A_PRIVFLAGS_FLAGS], names, info->extack, &mod); if (ret < 0 || !mod) goto out_free; ret = dev->ethtool_ops->set_priv_flags(dev, flags); if (ret < 0) goto out_free; ret = 1; out_free: kfree(names); return ret; } const struct ethnl_request_ops ethnl_privflags_request_ops = { .request_cmd = ETHTOOL_MSG_PRIVFLAGS_GET, .reply_cmd = ETHTOOL_MSG_PRIVFLAGS_GET_REPLY, .hdr_attr = ETHTOOL_A_PRIVFLAGS_HEADER, .req_info_size = sizeof(struct privflags_req_info), .reply_data_size = sizeof(struct privflags_reply_data), .prepare_data = privflags_prepare_data, .reply_size = privflags_reply_size, .fill_reply = privflags_fill_reply, .cleanup_data = privflags_cleanup_data, .set_validate = ethnl_set_privflags_validate, .set = ethnl_set_privflags, .set_ntf_cmd = ETHTOOL_MSG_PRIVFLAGS_NTF, };
4 8 12 1 1 3 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr> * (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/interrupt.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_limit.h> struct xt_limit_priv { unsigned long prev; u32 credit; }; MODULE_LICENSE("GPL"); MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>"); MODULE_DESCRIPTION("Xtables: rate-limit match"); MODULE_ALIAS("ipt_limit"); MODULE_ALIAS("ip6t_limit"); /* The algorithm used is the Simple Token Bucket Filter (TBF) * see net/sched/sch_tbf.c in the linux source tree */ /* Rusty: This is my (non-mathematically-inclined) understanding of this algorithm. The `average rate' in jiffies becomes your initial amount of credit `credit' and the most credit you can ever have `credit_cap'. The `peak rate' becomes the cost of passing the test, `cost'. `prev' tracks the last packet hit: you gain one credit per jiffy. If you get credit balance more than this, the extra credit is discarded. Every time the match passes, you lose `cost' credits; if you don't have that many, the test fails. See Alexey's formal explanation in net/sched/sch_tbf.c. To get the maximum range, we multiply by this factor (ie. you get N credits per jiffy). We want to allow a rate as low as 1 per day (slowest userspace tool allows), which means CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32. ie. */ #define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24)) /* Repeated shift and or gives us all 1s, final shift and add 1 gives * us the power of 2 below the theoretical max, so GCC simply does a * shift. */ #define _POW2_BELOW2(x) ((x)|((x)>>1)) #define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2)) #define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4)) #define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8)) #define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16)) #define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1) #define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) static bool limit_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_rateinfo *r = par->matchinfo; struct xt_limit_priv *priv = r->master; unsigned long now; u32 old_credit, new_credit, credit_increase = 0; bool ret; /* fastpath if there is nothing to update */ if ((READ_ONCE(priv->credit) < r->cost) && (READ_ONCE(priv->prev) == jiffies)) return false; do { now = jiffies; credit_increase += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY; old_credit = READ_ONCE(priv->credit); new_credit = old_credit; new_credit += credit_increase; if (new_credit > r->credit_cap) new_credit = r->credit_cap; if (new_credit >= r->cost) { ret = true; new_credit -= r->cost; } else { ret = false; } } while (cmpxchg(&priv->credit, old_credit, new_credit) != old_credit); return ret; } /* Precision saver. */ static u32 user2credits(u32 user) { /* If multiplying would overflow... */ if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) /* Divide first. */ return (user / XT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; return (user * HZ * CREDITS_PER_JIFFY) / XT_LIMIT_SCALE; } static int limit_mt_check(const struct xt_mtchk_param *par) { struct xt_rateinfo *r = par->matchinfo; struct xt_limit_priv *priv; /* Check for overflow. */ if (r->burst == 0 || user2credits(r->avg * r->burst) < user2credits(r->avg)) { pr_info_ratelimited("Overflow, try lower: %u/%u\n", r->avg, r->burst); return -ERANGE; } priv = kmalloc(sizeof(*priv), GFP_KERNEL); if (priv == NULL) return -ENOMEM; /* For SMP, we only want to use one set of state. */ r->master = priv; /* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies * 128. */ priv->prev = jiffies; priv->credit = user2credits(r->avg * r->burst); /* Credits full. */ if (r->cost == 0) { r->credit_cap = priv->credit; /* Credits full. */ r->cost = user2credits(r->avg); } return 0; } static void limit_mt_destroy(const struct xt_mtdtor_param *par) { const struct xt_rateinfo *info = par->matchinfo; kfree(info->master); } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_xt_rateinfo { u_int32_t avg; u_int32_t burst; compat_ulong_t prev; u_int32_t credit; u_int32_t credit_cap, cost; u_int32_t master; }; /* To keep the full "prev" timestamp, the upper 32 bits are stored in the * master pointer, which does not need to be preserved. */ static void limit_mt_compat_from_user(void *dst, const void *src) { const struct compat_xt_rateinfo *cm = src; struct xt_rateinfo m = { .avg = cm->avg, .burst = cm->burst, .prev = cm->prev | (unsigned long)cm->master << 32, .credit = cm->credit, .credit_cap = cm->credit_cap, .cost = cm->cost, }; memcpy(dst, &m, sizeof(m)); } static int limit_mt_compat_to_user(void __user *dst, const void *src) { const struct xt_rateinfo *m = src; struct compat_xt_rateinfo cm = { .avg = m->avg, .burst = m->burst, .prev = m->prev, .credit = m->credit, .credit_cap = m->credit_cap, .cost = m->cost, .master = m->prev >> 32, }; return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0; } #endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ static struct xt_match limit_mt_reg __read_mostly = { .name = "limit", .revision = 0, .family = NFPROTO_UNSPEC, .match = limit_mt, .checkentry = limit_mt_check, .destroy = limit_mt_destroy, .matchsize = sizeof(struct xt_rateinfo), #ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(struct compat_xt_rateinfo), .compat_from_user = limit_mt_compat_from_user, .compat_to_user = limit_mt_compat_to_user, #endif .usersize = offsetof(struct xt_rateinfo, prev), .me = THIS_MODULE, }; static int __init limit_mt_init(void) { return xt_register_match(&limit_mt_reg); } static void __exit limit_mt_exit(void) { xt_unregister_match(&limit_mt_reg); } module_init(limit_mt_init); module_exit(limit_mt_exit);
121 122 702 700 138 623 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 // SPDX-License-Identifier: GPL-2.0 /* * This file contains functions which manage clock event devices. * * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ #include <linux/clockchips.h> #include <linux/hrtimer.h> #include <linux/init.h> #include <linux/module.h> #include <linux/smp.h> #include <linux/device.h> #include "tick-internal.h" /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); /* Protection for the above */ static DEFINE_RAW_SPINLOCK(clockevents_lock); /* Protection for unbind operations */ static DEFINE_MUTEX(clockevents_mutex); struct ce_unbind { struct clock_event_device *ce; int res; }; static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, bool ismax) { u64 clc = (u64) latch << evt->shift; u64 rnd; if (WARN_ON(!evt->mult)) evt->mult = 1; rnd = (u64) evt->mult - 1; /* * Upper bound sanity check. If the backwards conversion is * not equal latch, we know that the above shift overflowed. */ if ((clc >> evt->shift) != (u64)latch) clc = ~0ULL; /* * Scaled math oddities: * * For mult <= (1 << shift) we can safely add mult - 1 to * prevent integer rounding loss. So the backwards conversion * from nsec to device ticks will be correct. * * For mult > (1 << shift), i.e. device frequency is > 1GHz we * need to be careful. Adding mult - 1 will result in a value * which when converted back to device ticks can be larger * than latch by up to (mult - 1) >> shift. For the min_delta * calculation we still want to apply this in order to stay * above the minimum device ticks limit. For the upper limit * we would end up with a latch value larger than the upper * limit of the device, so we omit the add to stay below the * device upper boundary. * * Also omit the add if it would overflow the u64 boundary. */ if ((~0ULL - clc > rnd) && (!ismax || evt->mult <= (1ULL << evt->shift))) clc += rnd; do_div(clc, evt->mult); /* Deltas less than 1usec are pointless noise */ return clc > 1000 ? clc : 1000; } /** * clockevent_delta2ns - Convert a latch value (device ticks) to nanoseconds * @latch: value to convert * @evt: pointer to clock event device descriptor * * Math helper, returns latch value converted to nanoseconds (bound checked) */ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) { return cev_delta2ns(latch, evt, false); } EXPORT_SYMBOL_GPL(clockevent_delta2ns); static int __clockevents_switch_state(struct clock_event_device *dev, enum clock_event_state state) { if (dev->features & CLOCK_EVT_FEAT_DUMMY) return 0; /* Transition with new state-specific callbacks */ switch (state) { case CLOCK_EVT_STATE_DETACHED: /* The clockevent device is getting replaced. Shut it down. */ case CLOCK_EVT_STATE_SHUTDOWN: if (dev->set_state_shutdown) return dev->set_state_shutdown(dev); return 0; case CLOCK_EVT_STATE_PERIODIC: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) return -ENOSYS; if (dev->set_state_periodic) return dev->set_state_periodic(dev); return 0; case CLOCK_EVT_STATE_ONESHOT: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return -ENOSYS; if (dev->set_state_oneshot) return dev->set_state_oneshot(dev); return 0; case CLOCK_EVT_STATE_ONESHOT_STOPPED: /* Core internal bug */ if (WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", clockevent_get_state(dev))) return -EINVAL; if (dev->set_state_oneshot_stopped) return dev->set_state_oneshot_stopped(dev); else return -ENOSYS; default: return -ENOSYS; } } /** * clockevents_switch_state - set the operating state of a clock event device * @dev: device to modify * @state: new state * * Must be called with interrupts disabled ! */ void clockevents_switch_state(struct clock_event_device *dev, enum clock_event_state state) { if (clockevent_get_state(dev) != state) { if (__clockevents_switch_state(dev, state)) return; clockevent_set_state(dev, state); /* * A nsec2cyc multiplicator of 0 is invalid and we'd crash * on it, so fix it up and emit a warning: */ if (clockevent_state_oneshot(dev)) { if (WARN_ON(!dev->mult)) dev->mult = 1; } } } /** * clockevents_shutdown - shutdown the device and clear next_event * @dev: device to shutdown */ void clockevents_shutdown(struct clock_event_device *dev) { clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); dev->next_event = KTIME_MAX; } /** * clockevents_tick_resume - Resume the tick device before using it again * @dev: device to resume */ int clockevents_tick_resume(struct clock_event_device *dev) { int ret = 0; if (dev->tick_resume) ret = dev->tick_resume(dev); return ret; } #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST /* Limit min_delta to a jiffie */ #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) /** * clockevents_increase_min_delta - raise minimum delta of a clock event device * @dev: device to increase the minimum delta * * Returns 0 on success, -ETIME when the minimum delta reached the limit. */ static int clockevents_increase_min_delta(struct clock_event_device *dev) { /* Nothing to do if we already reached the limit */ if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { printk_deferred(KERN_WARNING "CE: Reprogramming failure. Giving up\n"); dev->next_event = KTIME_MAX; return -ETIME; } if (dev->min_delta_ns < 5000) dev->min_delta_ns = 5000; else dev->min_delta_ns += dev->min_delta_ns >> 1; if (dev->min_delta_ns > MIN_DELTA_LIMIT) dev->min_delta_ns = MIN_DELTA_LIMIT; printk_deferred(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", dev->name ? dev->name : "?", (unsigned long long) dev->min_delta_ns); return 0; } /** * clockevents_program_min_delta - Set clock event device to the minimum delay. * @dev: device to program * * Returns 0 on success, -ETIME when the retry loop failed. */ static int clockevents_program_min_delta(struct clock_event_device *dev) { unsigned long long clc; int64_t delta; int i; for (i = 0;;) { delta = dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); if (clockevent_state_shutdown(dev)) return 0; dev->retries++; clc = ((unsigned long long) delta * dev->mult) >> dev->shift; if (dev->set_next_event((unsigned long) clc, dev) == 0) return 0; if (++i > 2) { /* * We tried 3 times to program the device with the * given min_delta_ns. Try to increase the minimum * delta, if that fails as well get out of here. */ if (clockevents_increase_min_delta(dev)) return -ETIME; i = 0; } } } #else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ /** * clockevents_program_min_delta - Set clock event device to the minimum delay. * @dev: device to program * * Returns 0 on success, -ETIME when the retry loop failed. */ static int clockevents_program_min_delta(struct clock_event_device *dev) { unsigned long long clc; int64_t delta = 0; int i; for (i = 0; i < 10; i++) { delta += dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); if (clockevent_state_shutdown(dev)) return 0; dev->retries++; clc = ((unsigned long long) delta * dev->mult) >> dev->shift; if (dev->set_next_event((unsigned long) clc, dev) == 0) return 0; } return -ETIME; } #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ /** * clockevents_program_event - Reprogram the clock event device. * @dev: device to program * @expires: absolute expiry time (monotonic clock) * @force: program minimum delay if expires can not be set * * Returns 0 on success, -ETIME when the event is in the past. */ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force) { unsigned long long clc; int64_t delta; int rc; if (WARN_ON_ONCE(expires < 0)) return -ETIME; dev->next_event = expires; if (clockevent_state_shutdown(dev)) return 0; /* We must be in ONESHOT state here */ WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", clockevent_get_state(dev)); /* Shortcut for clockevent devices that can deal with ktime. */ if (dev->features & CLOCK_EVT_FEAT_KTIME) return dev->set_next_ktime(expires, dev); delta = ktime_to_ns(ktime_sub(expires, ktime_get())); if (delta <= 0) return force ? clockevents_program_min_delta(dev) : -ETIME; delta = min(delta, (int64_t) dev->max_delta_ns); delta = max(delta, (int64_t) dev->min_delta_ns); clc = ((unsigned long long) delta * dev->mult) >> dev->shift; rc = dev->set_next_event((unsigned long) clc, dev); return (rc && force) ? clockevents_program_min_delta(dev) : rc; } /* * Called after a notify add to make devices available which were * released from the notifier call. */ static void clockevents_notify_released(void) { struct clock_event_device *dev; while (!list_empty(&clockevents_released)) { dev = list_entry(clockevents_released.next, struct clock_event_device, list); list_move(&dev->list, &clockevent_devices); tick_check_new_device(dev); } } /* * Try to install a replacement clock event device */ static int clockevents_replace(struct clock_event_device *ced) { struct clock_event_device *dev, *newdev = NULL; list_for_each_entry(dev, &clockevent_devices, list) { if (dev == ced || !clockevent_state_detached(dev)) continue; if (!tick_check_replacement(newdev, dev)) continue; if (!try_module_get(dev->owner)) continue; if (newdev) module_put(newdev->owner); newdev = dev; } if (newdev) { tick_install_replacement(newdev); list_del_init(&ced->list); } return newdev ? 0 : -EBUSY; } /* * Called with clockevents_mutex and clockevents_lock held */ static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) { /* Fast track. Device is unused */ if (clockevent_state_detached(ced)) { list_del_init(&ced->list); return 0; } return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY; } /* * SMP function call to unbind a device */ static void __clockevents_unbind(void *arg) { struct ce_unbind *cu = arg; int res; raw_spin_lock(&clockevents_lock); res = __clockevents_try_unbind(cu->ce, smp_processor_id()); if (res == -EAGAIN) res = clockevents_replace(cu->ce); cu->res = res; raw_spin_unlock(&clockevents_lock); } /* * Issues smp function call to unbind a per cpu device. Called with * clockevents_mutex held. */ static int clockevents_unbind(struct clock_event_device *ced, int cpu) { struct ce_unbind cu = { .ce = ced, .res = -ENODEV }; smp_call_function_single(cpu, __clockevents_unbind, &cu, 1); return cu.res; } /* * Unbind a clockevents device. */ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) { int ret; mutex_lock(&clockevents_mutex); ret = clockevents_unbind(ced, cpu); mutex_unlock(&clockevents_mutex); return ret; } EXPORT_SYMBOL_GPL(clockevents_unbind_device); /** * clockevents_register_device - register a clock event device * @dev: device to register */ void clockevents_register_device(struct clock_event_device *dev) { unsigned long flags; /* Initialize state to DETACHED */ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); if (!dev->cpumask) { WARN_ON(num_possible_cpus() > 1); dev->cpumask = cpumask_of(smp_processor_id()); } if (dev->cpumask == cpu_all_mask) { WARN(1, "%s cpumask == cpu_all_mask, using cpu_possible_mask instead\n", dev->name); dev->cpumask = cpu_possible_mask; } raw_spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); tick_check_new_device(dev); clockevents_notify_released(); raw_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_register_device); static void clockevents_config(struct clock_event_device *dev, u32 freq) { u64 sec; if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return; /* * Calculate the maximum number of seconds we can sleep. Limit * to 10 minutes for hardware which can program more than * 32bit ticks so we still get reasonable conversion values. */ sec = dev->max_delta_ticks; do_div(sec, freq); if (!sec) sec = 1; else if (sec > 600 && dev->max_delta_ticks > UINT_MAX) sec = 600; clockevents_calc_mult_shift(dev, freq, sec); dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true); } /** * clockevents_config_and_register - Configure and register a clock event device * @dev: device to register * @freq: The clock frequency * @min_delta: The minimum clock ticks to program in oneshot mode * @max_delta: The maximum clock ticks to program in oneshot mode * * min/max_delta can be 0 for devices which do not support oneshot mode. */ void clockevents_config_and_register(struct clock_event_device *dev, u32 freq, unsigned long min_delta, unsigned long max_delta) { dev->min_delta_ticks = min_delta; dev->max_delta_ticks = max_delta; clockevents_config(dev, freq); clockevents_register_device(dev); } EXPORT_SYMBOL_GPL(clockevents_config_and_register); int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) { clockevents_config(dev, freq); if (clockevent_state_oneshot(dev)) return clockevents_program_event(dev, dev->next_event, false); if (clockevent_state_periodic(dev)) return __clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); return 0; } /** * clockevents_update_freq - Update frequency and reprogram a clock event device. * @dev: device to modify * @freq: new device frequency * * Reconfigure and reprogram a clock event device in oneshot * mode. Must be called on the cpu for which the device delivers per * cpu timer events. If called for the broadcast device the core takes * care of serialization. * * Returns 0 on success, -ETIME when the event is in the past. */ int clockevents_update_freq(struct clock_event_device *dev, u32 freq) { unsigned long flags; int ret; local_irq_save(flags); ret = tick_broadcast_update_freq(dev, freq); if (ret == -ENODEV) ret = __clockevents_update_freq(dev, freq); local_irq_restore(flags); return ret; } /* * Noop handler when we shut down an event device */ void clockevents_handle_noop(struct clock_event_device *dev) { } /** * clockevents_exchange_device - release and request clock devices * @old: device to release (can be NULL) * @new: device to request (can be NULL) * * Called from various tick functions with clockevents_lock held and * interrupts disabled. */ void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device *new) { /* * Caller releases a clock event device. We queue it into the * released list and do a notify add later. */ if (old) { module_put(old->owner); clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED); list_move(&old->list, &clockevents_released); } if (new) { BUG_ON(!clockevent_state_detached(new)); clockevents_shutdown(new); } } /** * clockevents_suspend - suspend clock devices */ void clockevents_suspend(void) { struct clock_event_device *dev; list_for_each_entry_reverse(dev, &clockevent_devices, list) if (dev->suspend && !clockevent_state_detached(dev)) dev->suspend(dev); } /** * clockevents_resume - resume clock devices */ void clockevents_resume(void) { struct clock_event_device *dev; list_for_each_entry(dev, &clockevent_devices, list) if (dev->resume && !clockevent_state_detached(dev)) dev->resume(dev); } #ifdef CONFIG_HOTPLUG_CPU # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST /** * tick_offline_cpu - Take CPU out of the broadcast mechanism * @cpu: The outgoing CPU * * Called on the outgoing CPU after it took itself offline. */ void tick_offline_cpu(unsigned int cpu) { raw_spin_lock(&clockevents_lock); tick_broadcast_offline(cpu); raw_spin_unlock(&clockevents_lock); } # endif /** * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu * @cpu: The dead CPU */ void tick_cleanup_dead_cpu(int cpu) { struct clock_event_device *dev, *tmp; unsigned long flags; raw_spin_lock_irqsave(&clockevents_lock, flags); tick_shutdown(cpu); /* * Unregister the clock event devices which were * released from the users in the notify chain. */ list_for_each_entry_safe(dev, tmp, &clockevents_released, list) list_del(&dev->list); /* * Now check whether the CPU has left unused per cpu devices */ list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { if (cpumask_test_cpu(cpu, dev->cpumask) && cpumask_weight(dev->cpumask) == 1 && !tick_is_broadcast_device(dev)) { BUG_ON(!clockevent_state_detached(dev)); list_del(&dev->list); } } raw_spin_unlock_irqrestore(&clockevents_lock, flags); } #endif #ifdef CONFIG_SYSFS static struct bus_type clockevents_subsys = { .name = "clockevents", .dev_name = "clockevent", }; static DEFINE_PER_CPU(struct device, tick_percpu_dev); static struct tick_device *tick_get_tick_dev(struct device *dev); static ssize_t current_device_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tick_device *td; ssize_t count = 0; raw_spin_lock_irq(&clockevents_lock); td = tick_get_tick_dev(dev); if (td && td->evtdev) count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name); raw_spin_unlock_irq(&clockevents_lock); return count; } static DEVICE_ATTR_RO(current_device); /* We don't support the abomination of removable broadcast devices */ static ssize_t unbind_device_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { char name[CS_NAME_LEN]; ssize_t ret = sysfs_get_uname(buf, name, count); struct clock_event_device *ce = NULL, *iter; if (ret < 0) return ret; ret = -ENODEV; mutex_lock(&clockevents_mutex); raw_spin_lock_irq(&clockevents_lock); list_for_each_entry(iter, &clockevent_devices, list) { if (!strcmp(iter->name, name)) { ret = __clockevents_try_unbind(iter, dev->id); ce = iter; break; } } raw_spin_unlock_irq(&clockevents_lock); /* * We hold clockevents_mutex, so ce can't go away */ if (ret == -EAGAIN) ret = clockevents_unbind(ce, dev->id); mutex_unlock(&clockevents_mutex); return ret ? ret : count; } static DEVICE_ATTR_WO(unbind_device); #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST static struct device tick_bc_dev = { .init_name = "broadcast", .id = 0, .bus = &clockevents_subsys, }; static struct tick_device *tick_get_tick_dev(struct device *dev) { return dev == &tick_bc_dev ? tick_get_broadcast_device() : &per_cpu(tick_cpu_device, dev->id); } static __init int tick_broadcast_init_sysfs(void) { int err = device_register(&tick_bc_dev); if (!err) err = device_create_file(&tick_bc_dev, &dev_attr_current_device); return err; } #else static struct tick_device *tick_get_tick_dev(struct device *dev) { return &per_cpu(tick_cpu_device, dev->id); } static inline int tick_broadcast_init_sysfs(void) { return 0; } #endif static int __init tick_init_sysfs(void) { int cpu; for_each_possible_cpu(cpu) { struct device *dev = &per_cpu(tick_percpu_dev, cpu); int err; dev->id = cpu; dev->bus = &clockevents_subsys; err = device_register(dev); if (!err) err = device_create_file(dev, &dev_attr_current_device); if (!err) err = device_create_file(dev, &dev_attr_unbind_device); if (err) return err; } return tick_broadcast_init_sysfs(); } static int __init clockevents_init_sysfs(void) { int err = subsys_system_register(&clockevents_subsys, NULL); if (!err) err = tick_init_sysfs(); return err; } device_initcall(clockevents_init_sysfs); #endif /* SYSFS */
75 75 77 77 77 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 // SPDX-License-Identifier: GPL-2.0-only /* * lowlevel.c * * PURPOSE * Low Level Device Routines for the UDF filesystem * * COPYRIGHT * (C) 1999-2001 Ben Fennema * * HISTORY * * 03/26/99 blf Created. */ #include "udfdecl.h" #include <linux/blkdev.h> #include <linux/cdrom.h> #include <linux/uaccess.h> #include "udf_sb.h" unsigned int udf_get_last_session(struct super_block *sb) { struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); struct cdrom_multisession ms_info; if (!cdi) { udf_debug("CDROMMULTISESSION not supported.\n"); return 0; } ms_info.addr_format = CDROM_LBA; if (cdrom_multisession(cdi, &ms_info) == 0) { udf_debug("XA disk: %s, vol_desc_start=%d\n", ms_info.xa_flag ? "yes" : "no", ms_info.addr.lba); if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ return ms_info.addr.lba; } return 0; } udf_pblk_t udf_get_last_block(struct super_block *sb) { struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); unsigned long lblock = 0; /* * The cdrom layer call failed or returned obviously bogus value? * Try using the device size... */ if (!cdi || cdrom_get_last_written(cdi, &lblock) || lblock == 0) { if (sb_bdev_nr_blocks(sb) > ~(udf_pblk_t)0) return 0; lblock = sb_bdev_nr_blocks(sb); } if (lblock) return lblock - 1; return 0; }
2 495 21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 /* SPDX-License-Identifier: GPL-2.0 */ /* * Wireless configuration interface internals. * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2018-2022 Intel Corporation */ #ifndef __NET_WIRELESS_CORE_H #define __NET_WIRELESS_CORE_H #include <linux/list.h> #include <linux/netdevice.h> #include <linux/rbtree.h> #include <linux/debugfs.h> #include <linux/rfkill.h> #include <linux/workqueue.h> #include <linux/rtnetlink.h> #include <net/genetlink.h> #include <net/cfg80211.h> #include "reg.h" #define WIPHY_IDX_INVALID -1 struct cfg80211_registered_device { const struct cfg80211_ops *ops; struct list_head list; /* rfkill support */ struct rfkill_ops rfkill_ops; struct work_struct rfkill_block; /* ISO / IEC 3166 alpha2 for which this device is receiving * country IEs on, this can help disregard country IEs from APs * on the same alpha2 quickly. The alpha2 may differ from * cfg80211_regdomain's alpha2 when an intersection has occurred. * If the AP is reconfigured this can also be used to tell us if * the country on the country IE changed. */ char country_ie_alpha2[2]; /* * the driver requests the regulatory core to set this regulatory * domain as the wiphy's. Only used for %REGULATORY_WIPHY_SELF_MANAGED * devices using the regulatory_set_wiphy_regd() API */ const struct ieee80211_regdomain *requested_regd; /* If a Country IE has been received this tells us the environment * which its telling us its in. This defaults to ENVIRON_ANY */ enum environment_cap env; /* wiphy index, internal only */ int wiphy_idx; /* protected by RTNL */ int devlist_generation, wdev_id; int opencount; wait_queue_head_t dev_wait; struct list_head beacon_registrations; spinlock_t beacon_registrations_lock; /* protected by RTNL only */ int num_running_ifaces; int num_running_monitor_ifaces; u64 cookie_counter; /* BSSes/scanning */ spinlock_t bss_lock; struct list_head bss_list; struct rb_root bss_tree; u32 bss_generation; u32 bss_entries; struct cfg80211_scan_request *scan_req; /* protected by RTNL */ struct cfg80211_scan_request *int_scan_req; struct sk_buff *scan_msg; struct list_head sched_scan_req_list; time64_t suspend_at; struct wiphy_work scan_done_wk; struct genl_info *cur_cmd_info; struct work_struct conn_work; struct work_struct event_work; struct delayed_work dfs_update_channels_wk; struct wireless_dev *background_radar_wdev; struct cfg80211_chan_def background_radar_chandef; struct delayed_work background_cac_done_wk; struct work_struct background_cac_abort_wk; /* netlink port which started critical protocol (0 means not started) */ u32 crit_proto_nlportid; struct cfg80211_coalesce *coalesce; struct work_struct destroy_work; struct wiphy_work sched_scan_stop_wk; struct work_struct sched_scan_res_wk; struct cfg80211_chan_def radar_chandef; struct work_struct propagate_radar_detect_wk; struct cfg80211_chan_def cac_done_chandef; struct work_struct propagate_cac_done_wk; struct work_struct mgmt_registrations_update_wk; /* lock for all wdev lists */ spinlock_t mgmt_registrations_lock; struct work_struct wiphy_work; struct list_head wiphy_work_list; /* protects the list above */ spinlock_t wiphy_work_lock; bool suspended; /* must be last because of the way we do wiphy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ struct wiphy wiphy __aligned(NETDEV_ALIGN); }; static inline struct cfg80211_registered_device *wiphy_to_rdev(struct wiphy *wiphy) { BUG_ON(!wiphy); return container_of(wiphy, struct cfg80211_registered_device, wiphy); } static inline void cfg80211_rdev_free_wowlan(struct cfg80211_registered_device *rdev) { #ifdef CONFIG_PM int i; if (!rdev->wiphy.wowlan_config) return; for (i = 0; i < rdev->wiphy.wowlan_config->n_patterns; i++) kfree(rdev->wiphy.wowlan_config->patterns[i].mask); kfree(rdev->wiphy.wowlan_config->patterns); if (rdev->wiphy.wowlan_config->tcp && rdev->wiphy.wowlan_config->tcp->sock) sock_release(rdev->wiphy.wowlan_config->tcp->sock); kfree(rdev->wiphy.wowlan_config->tcp); kfree(rdev->wiphy.wowlan_config->nd_config); kfree(rdev->wiphy.wowlan_config); #endif } static inline u64 cfg80211_assign_cookie(struct cfg80211_registered_device *rdev) { u64 r = ++rdev->cookie_counter; if (WARN_ON(r == 0)) r = ++rdev->cookie_counter; return r; } extern struct workqueue_struct *cfg80211_wq; extern struct list_head cfg80211_rdev_list; extern int cfg80211_rdev_list_generation; /* This is constructed like this so it can be used in if/else */ static inline int for_each_rdev_check_rtnl(void) { ASSERT_RTNL(); return 0; } #define for_each_rdev(rdev) \ if (for_each_rdev_check_rtnl()) {} else \ list_for_each_entry(rdev, &cfg80211_rdev_list, list) struct cfg80211_internal_bss { struct list_head list; struct list_head hidden_list; struct rb_node rbn; u64 ts_boottime; unsigned long ts; unsigned long refcount; atomic_t hold; /* time at the start of the reception of the first octet of the * timestamp field of the last beacon/probe received for this BSS. * The time is the TSF of the BSS specified by %parent_bssid. */ u64 parent_tsf; /* the BSS according to which %parent_tsf is set. This is set to * the BSS that the interface that requested the scan was connected to * when the beacon/probe was received. */ u8 parent_bssid[ETH_ALEN] __aligned(2); /* must be last because of priv member */ struct cfg80211_bss pub; }; static inline struct cfg80211_internal_bss *bss_from_pub(struct cfg80211_bss *pub) { return container_of(pub, struct cfg80211_internal_bss, pub); } static inline void cfg80211_hold_bss(struct cfg80211_internal_bss *bss) { atomic_inc(&bss->hold); if (bss->pub.transmitted_bss) { bss = container_of(bss->pub.transmitted_bss, struct cfg80211_internal_bss, pub); atomic_inc(&bss->hold); } } static inline void cfg80211_unhold_bss(struct cfg80211_internal_bss *bss) { int r = atomic_dec_return(&bss->hold); WARN_ON(r < 0); if (bss->pub.transmitted_bss) { bss = container_of(bss->pub.transmitted_bss, struct cfg80211_internal_bss, pub); r = atomic_dec_return(&bss->hold); WARN_ON(r < 0); } } struct cfg80211_registered_device *cfg80211_rdev_by_wiphy_idx(int wiphy_idx); int get_wiphy_idx(struct wiphy *wiphy); struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx); int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, struct net *net); void cfg80211_init_wdev(struct wireless_dev *wdev); void cfg80211_register_wdev(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); static inline bool cfg80211_has_monitors_only(struct cfg80211_registered_device *rdev) { lockdep_assert_held(&rdev->wiphy.mtx); return rdev->num_running_ifaces == rdev->num_running_monitor_ifaces && rdev->num_running_ifaces > 0; } enum cfg80211_event_type { EVENT_CONNECT_RESULT, EVENT_ROAMED, EVENT_DISCONNECTED, EVENT_IBSS_JOINED, EVENT_STOPPED, EVENT_PORT_AUTHORIZED, }; struct cfg80211_event { struct list_head list; enum cfg80211_event_type type; union { struct cfg80211_connect_resp_params cr; struct cfg80211_roam_info rm; struct { const u8 *ie; size_t ie_len; u16 reason; bool locally_generated; } dc; struct { u8 bssid[ETH_ALEN]; struct ieee80211_channel *channel; } ij; struct { u8 peer_addr[ETH_ALEN]; const u8 *td_bitmap; u8 td_bitmap_len; } pa; }; }; struct cfg80211_cached_keys { struct key_params params[4]; u8 data[4][WLAN_KEY_LEN_WEP104]; int def; }; struct cfg80211_beacon_registration { struct list_head list; u32 nlportid; }; struct cfg80211_cqm_config { struct rcu_head rcu_head; u32 rssi_hyst; s32 last_rssi_event_value; enum nl80211_cqm_rssi_threshold_event last_rssi_event_type; bool use_range_api; int n_rssi_thresholds; s32 rssi_thresholds[] __counted_by(n_rssi_thresholds); }; void cfg80211_cqm_rssi_notify_work(struct wiphy *wiphy, struct wiphy_work *work); void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev); /* free object */ void cfg80211_dev_free(struct cfg80211_registered_device *rdev); int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, char *newname); void ieee80211_set_bitrate_flags(struct wiphy *wiphy); void cfg80211_bss_expire(struct cfg80211_registered_device *rdev); void cfg80211_bss_age(struct cfg80211_registered_device *rdev, unsigned long age_secs); void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev, unsigned int link, struct ieee80211_channel *channel); /* IBSS */ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ibss_params *params, struct cfg80211_cached_keys *connkeys); void cfg80211_clear_ibss(struct net_device *dev, bool nowext); int cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, bool nowext); void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, struct ieee80211_channel *channel); int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); /* mesh */ extern const struct mesh_config default_mesh_config; extern const struct mesh_setup default_mesh_setup; int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev, struct mesh_setup *setup, const struct mesh_config *conf); int cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev); int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_chan_def *chandef); /* OCB */ int cfg80211_join_ocb(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ocb_setup *setup); int cfg80211_leave_ocb(struct cfg80211_registered_device *rdev, struct net_device *dev); /* AP */ int cfg80211_stop_ap(struct cfg80211_registered_device *rdev, struct net_device *dev, int link, bool notify); /* MLME */ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_auth_request *req); int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_assoc_request *req); int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *bssid, const u8 *ie, int ie_len, u16 reason, bool local_state_change); int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *ap_addr, const u8 *ie, int ie_len, u16 reason, bool local_state_change); void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, struct net_device *dev); int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid, u16 frame_type, const u8 *match_data, int match_len, bool multicast_rx, struct netlink_ext_ack *extack); void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk); void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid); void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev); int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie); void cfg80211_oper_and_ht_capa(struct ieee80211_ht_cap *ht_capa, const struct ieee80211_ht_cap *ht_capa_mask); void cfg80211_oper_and_vht_capa(struct ieee80211_vht_cap *vht_capa, const struct ieee80211_vht_cap *vht_capa_mask); /* SME events */ int cfg80211_connect(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_connect_params *connect, struct cfg80211_cached_keys *connkeys, const u8 *prev_bssid); void __cfg80211_connect_result(struct net_device *dev, struct cfg80211_connect_resp_params *params, bool wextev); void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, size_t ie_len, u16 reason, bool from_ap); int cfg80211_disconnect(struct cfg80211_registered_device *rdev, struct net_device *dev, u16 reason, bool wextev); void __cfg80211_roamed(struct wireless_dev *wdev, struct cfg80211_roam_info *info); void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *peer_addr, const u8 *td_bitmap, u8 td_bitmap_len); int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); void cfg80211_autodisconnect_wk(struct work_struct *work); /* SME implementation */ void cfg80211_conn_work(struct work_struct *work); void cfg80211_sme_scan_done(struct net_device *dev); bool cfg80211_sme_rx_assoc_resp(struct wireless_dev *wdev, u16 status); void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len); void cfg80211_sme_disassoc(struct wireless_dev *wdev); void cfg80211_sme_deauth(struct wireless_dev *wdev); void cfg80211_sme_auth_timeout(struct wireless_dev *wdev); void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev); void cfg80211_sme_abandon_assoc(struct wireless_dev *wdev); /* internal helpers */ bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher); bool cfg80211_valid_key_idx(struct cfg80211_registered_device *rdev, int key_idx, bool pairwise); int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, struct key_params *params, int key_idx, bool pairwise, const u8 *mac_addr); void __cfg80211_scan_done(struct wiphy *wiphy, struct wiphy_work *wk); void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool send_message); void cfg80211_add_sched_scan_req(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *req); int cfg80211_sched_scan_req_possible(struct cfg80211_registered_device *rdev, bool want_multi); void cfg80211_sched_scan_results_wk(struct work_struct *work); int cfg80211_stop_sched_scan_req(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *req, bool driver_initiated); int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, u64 reqid, bool driver_initiated); void cfg80211_upload_connect_keys(struct wireless_dev *wdev); int cfg80211_change_iface(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype ntype, struct vif_params *params); void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev); void cfg80211_process_wiphy_works(struct cfg80211_registered_device *rdev, struct wiphy_work *end); void cfg80211_process_wdev_events(struct wireless_dev *wdev); bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, u32 center_freq_khz, u32 bw_khz); int cfg80211_scan(struct cfg80211_registered_device *rdev); extern struct work_struct cfg80211_disconnect_work; void cfg80211_set_dfs_state(struct wiphy *wiphy, const struct cfg80211_chan_def *chandef, enum nl80211_dfs_state dfs_state); void cfg80211_dfs_channels_update_work(struct work_struct *work); void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev); int cfg80211_start_background_radar_detection(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_chan_def *chandef); void cfg80211_stop_background_radar_detection(struct wireless_dev *wdev); void cfg80211_background_cac_done_wk(struct work_struct *work); void cfg80211_background_cac_abort_wk(struct work_struct *work); bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy, struct ieee80211_channel *chan); bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev); bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef, struct ieee80211_channel *chan, bool primary_only); bool cfg80211_wdev_on_sub_chan(struct wireless_dev *wdev, struct ieee80211_channel *chan, bool primary_only); static inline unsigned int elapsed_jiffies_msecs(unsigned long start) { unsigned long end = jiffies; if (end >= start) return jiffies_to_msecs(end - start); return jiffies_to_msecs(end + (ULONG_MAX - start) + 1); } int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev, struct cfg80211_chan_def *chandef); int ieee80211_get_ratemask(struct ieee80211_supported_band *sband, const u8 *rates, unsigned int n_rates, u32 *mask); int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype, u32 beacon_int); void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype, int num); void cfg80211_leave(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); struct cfg80211_internal_bss * cfg80211_bss_update(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *tmp, bool signal_valid, unsigned long ts); #ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS #define CFG80211_DEV_WARN_ON(cond) WARN_ON(cond) #else /* * Trick to enable using it as a condition, * and also not give a warning when it's * not used that way. */ #define CFG80211_DEV_WARN_ON(cond) ({bool __r = (cond); __r; }) #endif void cfg80211_release_pmsr(struct wireless_dev *wdev, u32 portid); void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev); void cfg80211_pmsr_free_wk(struct work_struct *work); void cfg80211_remove_link(struct wireless_dev *wdev, unsigned int link_id); void cfg80211_remove_links(struct wireless_dev *wdev); int cfg80211_remove_virtual_intf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); void cfg80211_wdev_release_link_bsses(struct wireless_dev *wdev, u16 link_mask); #endif /* __NET_WIRELESS_CORE_H */
3 1 3 12 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2019 Axis Communications AB * * Based on ttyprintk.c: * Copyright (C) 2010 Samo Pogacnik */ #include <linux/console.h> #include <linux/module.h> #include <linux/tty.h> static const struct tty_port_operations ttynull_port_ops; static struct tty_driver *ttynull_driver; static struct tty_port ttynull_port; static int ttynull_open(struct tty_struct *tty, struct file *filp) { return tty_port_open(&ttynull_port, tty, filp); } static void ttynull_close(struct tty_struct *tty, struct file *filp) { tty_port_close(&ttynull_port, tty, filp); } static void ttynull_hangup(struct tty_struct *tty) { tty_port_hangup(&ttynull_port); } static ssize_t ttynull_write(struct tty_struct *tty, const u8 *buf, size_t count) { return count; } static unsigned int ttynull_write_room(struct tty_struct *tty) { return 65536; } static const struct tty_operations ttynull_ops = { .open = ttynull_open, .close = ttynull_close, .hangup = ttynull_hangup, .write = ttynull_write, .write_room = ttynull_write_room, }; static struct tty_driver *ttynull_device(struct console *c, int *index) { *index = 0; return ttynull_driver; } static struct console ttynull_console = { .name = "ttynull", .device = ttynull_device, }; static int __init ttynull_init(void) { struct tty_driver *driver; int ret; driver = tty_alloc_driver(1, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_UNNUMBERED_NODE); if (IS_ERR(driver)) return PTR_ERR(driver); tty_port_init(&ttynull_port); ttynull_port.ops = &ttynull_port_ops; driver->driver_name = "ttynull"; driver->name = "ttynull"; driver->type = TTY_DRIVER_TYPE_CONSOLE; driver->init_termios = tty_std_termios; driver->init_termios.c_oflag = OPOST | OCRNL | ONOCR | ONLRET; tty_set_operations(driver, &ttynull_ops); tty_port_link_device(&ttynull_port, driver, 0); ret = tty_register_driver(driver); if (ret < 0) { tty_driver_kref_put(driver); tty_port_destroy(&ttynull_port); return ret; } ttynull_driver = driver; register_console(&ttynull_console); return 0; } static void __exit ttynull_exit(void) { unregister_console(&ttynull_console); tty_unregister_driver(ttynull_driver); tty_driver_kref_put(ttynull_driver); tty_port_destroy(&ttynull_port); } module_init(ttynull_init); module_exit(ttynull_exit); MODULE_LICENSE("GPL v2");
15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/clnt.c * * This file contains the high-level RPC interface. * It is modeled as a finite state machine to support both synchronous * and asynchronous requests. * * - RPC header generation and argument serialization. * - Credential refresh. * - TCP connect handling. * - Retry of operation when it is suspected the operation failed because * of uid squashing on the server, or when the credentials were stale * and need to be refreshed, or when a packet was damaged in transit. * This may be have to be moved to the VFS layer. * * Copyright (C) 1992,1993 Rick Sladkey <jrs@world.std.com> * Copyright (C) 1995,1996 Olaf Kirch <okir@monad.swb.de> */ #include <linux/module.h> #include <linux/types.h> #include <linux/kallsyms.h> #include <linux/mm.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/slab.h> #include <linux/rcupdate.h> #include <linux/utsname.h> #include <linux/workqueue.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/un.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/sunrpc/metrics.h> #include <linux/sunrpc/bc_xprt.h> #include <trace/events/sunrpc.h> #include "sunrpc.h" #include "sysfs.h" #include "netns.h" #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_CALL #endif /* * All RPC clients are linked into this list */ static DECLARE_WAIT_QUEUE_HEAD(destroy_wait); static void call_start(struct rpc_task *task); static void call_reserve(struct rpc_task *task); static void call_reserveresult(struct rpc_task *task); static void call_allocate(struct rpc_task *task); static void call_encode(struct rpc_task *task); static void call_decode(struct rpc_task *task); static void call_bind(struct rpc_task *task); static void call_bind_status(struct rpc_task *task); static void call_transmit(struct rpc_task *task); static void call_status(struct rpc_task *task); static void call_transmit_status(struct rpc_task *task); static void call_refresh(struct rpc_task *task); static void call_refreshresult(struct rpc_task *task); static void call_connect(struct rpc_task *task); static void call_connect_status(struct rpc_task *task); static int rpc_encode_header(struct rpc_task *task, struct xdr_stream *xdr); static int rpc_decode_header(struct rpc_task *task, struct xdr_stream *xdr); static int rpc_ping(struct rpc_clnt *clnt); static int rpc_ping_noreply(struct rpc_clnt *clnt); static void rpc_check_timeout(struct rpc_task *task); static void rpc_register_client(struct rpc_clnt *clnt) { struct net *net = rpc_net_ns(clnt); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); spin_lock(&sn->rpc_client_lock); list_add(&clnt->cl_clients, &sn->all_clients); spin_unlock(&sn->rpc_client_lock); } static void rpc_unregister_client(struct rpc_clnt *clnt) { struct net *net = rpc_net_ns(clnt); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); spin_lock(&sn->rpc_client_lock); list_del(&clnt->cl_clients); spin_unlock(&sn->rpc_client_lock); } static void __rpc_clnt_remove_pipedir(struct rpc_clnt *clnt) { rpc_remove_client_dir(clnt); } static void rpc_clnt_remove_pipedir(struct rpc_clnt *clnt) { struct net *net = rpc_net_ns(clnt); struct super_block *pipefs_sb; pipefs_sb = rpc_get_sb_net(net); if (pipefs_sb) { if (pipefs_sb == clnt->pipefs_sb) __rpc_clnt_remove_pipedir(clnt); rpc_put_sb_net(net); } } static struct dentry *rpc_setup_pipedir_sb(struct super_block *sb, struct rpc_clnt *clnt) { static uint32_t clntid; const char *dir_name = clnt->cl_program->pipe_dir_name; char name[15]; struct dentry *dir, *dentry; dir = rpc_d_lookup_sb(sb, dir_name); if (dir == NULL) { pr_info("RPC: pipefs directory doesn't exist: %s\n", dir_name); return dir; } for (;;) { snprintf(name, sizeof(name), "clnt%x", (unsigned int)clntid++); name[sizeof(name) - 1] = '\0'; dentry = rpc_create_client_dir(dir, name, clnt); if (!IS_ERR(dentry)) break; if (dentry == ERR_PTR(-EEXIST)) continue; printk(KERN_INFO "RPC: Couldn't create pipefs entry" " %s/%s, error %ld\n", dir_name, name, PTR_ERR(dentry)); break; } dput(dir); return dentry; } static int rpc_setup_pipedir(struct super_block *pipefs_sb, struct rpc_clnt *clnt) { struct dentry *dentry; clnt->pipefs_sb = pipefs_sb; if (clnt->cl_program->pipe_dir_name != NULL) { dentry = rpc_setup_pipedir_sb(pipefs_sb, clnt); if (IS_ERR(dentry)) return PTR_ERR(dentry); } return 0; } static int rpc_clnt_skip_event(struct rpc_clnt *clnt, unsigned long event) { if (clnt->cl_program->pipe_dir_name == NULL) return 1; switch (event) { case RPC_PIPEFS_MOUNT: if (clnt->cl_pipedir_objects.pdh_dentry != NULL) return 1; if (refcount_read(&clnt->cl_count) == 0) return 1; break; case RPC_PIPEFS_UMOUNT: if (clnt->cl_pipedir_objects.pdh_dentry == NULL) return 1; break; } return 0; } static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event, struct super_block *sb) { struct dentry *dentry; switch (event) { case RPC_PIPEFS_MOUNT: dentry = rpc_setup_pipedir_sb(sb, clnt); if (!dentry) return -ENOENT; if (IS_ERR(dentry)) return PTR_ERR(dentry); break; case RPC_PIPEFS_UMOUNT: __rpc_clnt_remove_pipedir(clnt); break; default: printk(KERN_ERR "%s: unknown event: %ld\n", __func__, event); return -ENOTSUPP; } return 0; } static int __rpc_pipefs_event(struct rpc_clnt *clnt, unsigned long event, struct super_block *sb) { int error = 0; for (;; clnt = clnt->cl_parent) { if (!rpc_clnt_skip_event(clnt, event)) error = __rpc_clnt_handle_event(clnt, event, sb); if (error || clnt == clnt->cl_parent) break; } return error; } static struct rpc_clnt *rpc_get_client_for_event(struct net *net, int event) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_clnt *clnt; spin_lock(&sn->rpc_client_lock); list_for_each_entry(clnt, &sn->all_clients, cl_clients) { if (rpc_clnt_skip_event(clnt, event)) continue; spin_unlock(&sn->rpc_client_lock); return clnt; } spin_unlock(&sn->rpc_client_lock); return NULL; } static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct super_block *sb = ptr; struct rpc_clnt *clnt; int error = 0; while ((clnt = rpc_get_client_for_event(sb->s_fs_info, event))) { error = __rpc_pipefs_event(clnt, event, sb); if (error) break; } return error; } static struct notifier_block rpc_clients_block = { .notifier_call = rpc_pipefs_event, .priority = SUNRPC_PIPEFS_RPC_PRIO, }; int rpc_clients_notifier_register(void) { return rpc_pipefs_notifier_register(&rpc_clients_block); } void rpc_clients_notifier_unregister(void) { return rpc_pipefs_notifier_unregister(&rpc_clients_block); } static struct rpc_xprt *rpc_clnt_set_transport(struct rpc_clnt *clnt, struct rpc_xprt *xprt, const struct rpc_timeout *timeout) { struct rpc_xprt *old; spin_lock(&clnt->cl_lock); old = rcu_dereference_protected(clnt->cl_xprt, lockdep_is_held(&clnt->cl_lock)); if (!xprt_bound(xprt)) clnt->cl_autobind = 1; clnt->cl_timeout = timeout; rcu_assign_pointer(clnt->cl_xprt, xprt); spin_unlock(&clnt->cl_lock); return old; } static void rpc_clnt_set_nodename(struct rpc_clnt *clnt, const char *nodename) { clnt->cl_nodelen = strlcpy(clnt->cl_nodename, nodename, sizeof(clnt->cl_nodename)); } static int rpc_client_register(struct rpc_clnt *clnt, rpc_authflavor_t pseudoflavor, const char *client_name) { struct rpc_auth_create_args auth_args = { .pseudoflavor = pseudoflavor, .target_name = client_name, }; struct rpc_auth *auth; struct net *net = rpc_net_ns(clnt); struct super_block *pipefs_sb; int err; rpc_clnt_debugfs_register(clnt); pipefs_sb = rpc_get_sb_net(net); if (pipefs_sb) { err = rpc_setup_pipedir(pipefs_sb, clnt); if (err) goto out; } rpc_register_client(clnt); if (pipefs_sb) rpc_put_sb_net(net); auth = rpcauth_create(&auth_args, clnt); if (IS_ERR(auth)) { dprintk("RPC: Couldn't create auth handle (flavor %u)\n", pseudoflavor); err = PTR_ERR(auth); goto err_auth; } return 0; err_auth: pipefs_sb = rpc_get_sb_net(net); rpc_unregister_client(clnt); __rpc_clnt_remove_pipedir(clnt); out: if (pipefs_sb) rpc_put_sb_net(net); rpc_sysfs_client_destroy(clnt); rpc_clnt_debugfs_unregister(clnt); return err; } static DEFINE_IDA(rpc_clids); void rpc_cleanup_clids(void) { ida_destroy(&rpc_clids); } static int rpc_alloc_clid(struct rpc_clnt *clnt) { int clid; clid = ida_alloc(&rpc_clids, GFP_KERNEL); if (clid < 0) return clid; clnt->cl_clid = clid; return 0; } static void rpc_free_clid(struct rpc_clnt *clnt) { ida_free(&rpc_clids, clnt->cl_clid); } static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, struct rpc_xprt_switch *xps, struct rpc_xprt *xprt, struct rpc_clnt *parent) { const struct rpc_program *program = args->program; const struct rpc_version *version; struct rpc_clnt *clnt = NULL; const struct rpc_timeout *timeout; const char *nodename = args->nodename; int err; err = rpciod_up(); if (err) goto out_no_rpciod; err = -EINVAL; if (args->version >= program->nrvers) goto out_err; version = program->version[args->version]; if (version == NULL) goto out_err; err = -ENOMEM; clnt = kzalloc(sizeof(*clnt), GFP_KERNEL); if (!clnt) goto out_err; clnt->cl_parent = parent ? : clnt; clnt->cl_xprtsec = args->xprtsec; err = rpc_alloc_clid(clnt); if (err) goto out_no_clid; clnt->cl_cred = get_cred(args->cred); clnt->cl_procinfo = version->procs; clnt->cl_maxproc = version->nrprocs; clnt->cl_prog = args->prognumber ? : program->number; clnt->cl_vers = version->number; clnt->cl_stats = program->stats; clnt->cl_metrics = rpc_alloc_iostats(clnt); rpc_init_pipe_dir_head(&clnt->cl_pipedir_objects); err = -ENOMEM; if (clnt->cl_metrics == NULL) goto out_no_stats; clnt->cl_program = program; INIT_LIST_HEAD(&clnt->cl_tasks); spin_lock_init(&clnt->cl_lock); timeout = xprt->timeout; if (args->timeout != NULL) { memcpy(&clnt->cl_timeout_default, args->timeout, sizeof(clnt->cl_timeout_default)); timeout = &clnt->cl_timeout_default; } rpc_clnt_set_transport(clnt, xprt, timeout); xprt->main = true; xprt_iter_init(&clnt->cl_xpi, xps); xprt_switch_put(xps); clnt->cl_rtt = &clnt->cl_rtt_default; rpc_init_rtt(&clnt->cl_rtt_default, clnt->cl_timeout->to_initval); refcount_set(&clnt->cl_count, 1); if (nodename == NULL) nodename = utsname()->nodename; /* save the nodename */ rpc_clnt_set_nodename(clnt, nodename); rpc_sysfs_client_setup(clnt, xps, rpc_net_ns(clnt)); err = rpc_client_register(clnt, args->authflavor, args->client_name); if (err) goto out_no_path; if (parent) refcount_inc(&parent->cl_count); trace_rpc_clnt_new(clnt, xprt, args); return clnt; out_no_path: rpc_free_iostats(clnt->cl_metrics); out_no_stats: put_cred(clnt->cl_cred); rpc_free_clid(clnt); out_no_clid: kfree(clnt); out_err: rpciod_down(); out_no_rpciod: xprt_switch_put(xps); xprt_put(xprt); trace_rpc_clnt_new_err(program->name, args->servername, err); return ERR_PTR(err); } static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, struct rpc_xprt *xprt) { struct rpc_clnt *clnt = NULL; struct rpc_xprt_switch *xps; if (args->bc_xprt && args->bc_xprt->xpt_bc_xps) { WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC)); xps = args->bc_xprt->xpt_bc_xps; xprt_switch_get(xps); } else { xps = xprt_switch_alloc(xprt, GFP_KERNEL); if (xps == NULL) { xprt_put(xprt); return ERR_PTR(-ENOMEM); } if (xprt->bc_xprt) { xprt_switch_get(xps); xprt->bc_xprt->xpt_bc_xps = xps; } } clnt = rpc_new_client(args, xps, xprt, NULL); if (IS_ERR(clnt)) return clnt; if (!(args->flags & RPC_CLNT_CREATE_NOPING)) { int err = rpc_ping(clnt); if (err != 0) { rpc_shutdown_client(clnt); return ERR_PTR(err); } } else if (args->flags & RPC_CLNT_CREATE_CONNECTED) { int err = rpc_ping_noreply(clnt); if (err != 0) { rpc_shutdown_client(clnt); return ERR_PTR(err); } } clnt->cl_softrtry = 1; if (args->flags & (RPC_CLNT_CREATE_HARDRTRY|RPC_CLNT_CREATE_SOFTERR)) { clnt->cl_softrtry = 0; if (args->flags & RPC_CLNT_CREATE_SOFTERR) clnt->cl_softerr = 1; } if (args->flags & RPC_CLNT_CREATE_AUTOBIND) clnt->cl_autobind = 1; if (args->flags & RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT) clnt->cl_noretranstimeo = 1; if (args->flags & RPC_CLNT_CREATE_DISCRTRY) clnt->cl_discrtry = 1; if (!(args->flags & RPC_CLNT_CREATE_QUIET)) clnt->cl_chatty = 1; return clnt; } /** * rpc_create - create an RPC client and transport with one call * @args: rpc_clnt create argument structure * * Creates and initializes an RPC transport and an RPC client. * * It can ping the server in order to determine if it is up, and to see if * it supports this program and version. RPC_CLNT_CREATE_NOPING disables * this behavior so asynchronous tasks can also use rpc_create. */ struct rpc_clnt *rpc_create(struct rpc_create_args *args) { struct rpc_xprt *xprt; struct xprt_create xprtargs = { .net = args->net, .ident = args->protocol, .srcaddr = args->saddress, .dstaddr = args->address, .addrlen = args->addrsize, .servername = args->servername, .bc_xprt = args->bc_xprt, .xprtsec = args->xprtsec, .connect_timeout = args->connect_timeout, .reconnect_timeout = args->reconnect_timeout, }; char servername[48]; struct rpc_clnt *clnt; int i; if (args->bc_xprt) { WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC)); xprt = args->bc_xprt->xpt_bc_xprt; if (xprt) { xprt_get(xprt); return rpc_create_xprt(args, xprt); } } if (args->flags & RPC_CLNT_CREATE_INFINITE_SLOTS) xprtargs.flags |= XPRT_CREATE_INFINITE_SLOTS; if (args->flags & RPC_CLNT_CREATE_NO_IDLE_TIMEOUT) xprtargs.flags |= XPRT_CREATE_NO_IDLE_TIMEOUT; /* * If the caller chooses not to specify a hostname, whip * up a string representation of the passed-in address. */ if (xprtargs.servername == NULL) { struct sockaddr_un *sun = (struct sockaddr_un *)args->address; struct sockaddr_in *sin = (struct sockaddr_in *)args->address; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)args->address; servername[0] = '\0'; switch (args->address->sa_family) { case AF_LOCAL: if (sun->sun_path[0]) snprintf(servername, sizeof(servername), "%s", sun->sun_path); else snprintf(servername, sizeof(servername), "@%s", sun->sun_path+1); break; case AF_INET: snprintf(servername, sizeof(servername), "%pI4", &sin->sin_addr.s_addr); break; case AF_INET6: snprintf(servername, sizeof(servername), "%pI6", &sin6->sin6_addr); break; default: /* caller wants default server name, but * address family isn't recognized. */ return ERR_PTR(-EINVAL); } xprtargs.servername = servername; } xprt = xprt_create_transport(&xprtargs); if (IS_ERR(xprt)) return (struct rpc_clnt *)xprt; /* * By default, kernel RPC client connects from a reserved port. * CAP_NET_BIND_SERVICE will not be set for unprivileged requesters, * but it is always enabled for rpciod, which handles the connect * operation. */ xprt->resvport = 1; if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT) xprt->resvport = 0; xprt->reuseport = 0; if (args->flags & RPC_CLNT_CREATE_REUSEPORT) xprt->reuseport = 1; clnt = rpc_create_xprt(args, xprt); if (IS_ERR(clnt) || args->nconnect <= 1) return clnt; for (i = 0; i < args->nconnect - 1; i++) { if (rpc_clnt_add_xprt(clnt, &xprtargs, NULL, NULL) < 0) break; } return clnt; } EXPORT_SYMBOL_GPL(rpc_create); /* * This function clones the RPC client structure. It allows us to share the * same transport while varying parameters such as the authentication * flavour. */ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, struct rpc_clnt *clnt) { struct rpc_xprt_switch *xps; struct rpc_xprt *xprt; struct rpc_clnt *new; int err; err = -ENOMEM; rcu_read_lock(); xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); rcu_read_unlock(); if (xprt == NULL || xps == NULL) { xprt_put(xprt); xprt_switch_put(xps); goto out_err; } args->servername = xprt->servername; args->nodename = clnt->cl_nodename; new = rpc_new_client(args, xps, xprt, clnt); if (IS_ERR(new)) return new; /* Turn off autobind on clones */ new->cl_autobind = 0; new->cl_softrtry = clnt->cl_softrtry; new->cl_softerr = clnt->cl_softerr; new->cl_noretranstimeo = clnt->cl_noretranstimeo; new->cl_discrtry = clnt->cl_discrtry; new->cl_chatty = clnt->cl_chatty; new->cl_principal = clnt->cl_principal; new->cl_max_connect = clnt->cl_max_connect; return new; out_err: trace_rpc_clnt_clone_err(clnt, err); return ERR_PTR(err); } /** * rpc_clone_client - Clone an RPC client structure * * @clnt: RPC client whose parameters are copied * * Returns a fresh RPC client or an ERR_PTR. */ struct rpc_clnt *rpc_clone_client(struct rpc_clnt *clnt) { struct rpc_create_args args = { .program = clnt->cl_program, .prognumber = clnt->cl_prog, .version = clnt->cl_vers, .authflavor = clnt->cl_auth->au_flavor, .cred = clnt->cl_cred, }; return __rpc_clone_client(&args, clnt); } EXPORT_SYMBOL_GPL(rpc_clone_client); /** * rpc_clone_client_set_auth - Clone an RPC client structure and set its auth * * @clnt: RPC client whose parameters are copied * @flavor: security flavor for new client * * Returns a fresh RPC client or an ERR_PTR. */ struct rpc_clnt * rpc_clone_client_set_auth(struct rpc_clnt *clnt, rpc_authflavor_t flavor) { struct rpc_create_args args = { .program = clnt->cl_program, .prognumber = clnt->cl_prog, .version = clnt->cl_vers, .authflavor = flavor, .cred = clnt->cl_cred, }; return __rpc_clone_client(&args, clnt); } EXPORT_SYMBOL_GPL(rpc_clone_client_set_auth); /** * rpc_switch_client_transport: switch the RPC transport on the fly * @clnt: pointer to a struct rpc_clnt * @args: pointer to the new transport arguments * @timeout: pointer to the new timeout parameters * * This function allows the caller to switch the RPC transport for the * rpc_clnt structure 'clnt' to allow it to connect to a mirrored NFS * server, for instance. It assumes that the caller has ensured that * there are no active RPC tasks by using some form of locking. * * Returns zero if "clnt" is now using the new xprt. Otherwise a * negative errno is returned, and "clnt" continues to use the old * xprt. */ int rpc_switch_client_transport(struct rpc_clnt *clnt, struct xprt_create *args, const struct rpc_timeout *timeout) { const struct rpc_timeout *old_timeo; rpc_authflavor_t pseudoflavor; struct rpc_xprt_switch *xps, *oldxps; struct rpc_xprt *xprt, *old; struct rpc_clnt *parent; int err; args->xprtsec = clnt->cl_xprtsec; xprt = xprt_create_transport(args); if (IS_ERR(xprt)) return PTR_ERR(xprt); xps = xprt_switch_alloc(xprt, GFP_KERNEL); if (xps == NULL) { xprt_put(xprt); return -ENOMEM; } pseudoflavor = clnt->cl_auth->au_flavor; old_timeo = clnt->cl_timeout; old = rpc_clnt_set_transport(clnt, xprt, timeout); oldxps = xprt_iter_xchg_switch(&clnt->cl_xpi, xps); rpc_unregister_client(clnt); __rpc_clnt_remove_pipedir(clnt); rpc_sysfs_client_destroy(clnt); rpc_clnt_debugfs_unregister(clnt); /* * A new transport was created. "clnt" therefore * becomes the root of a new cl_parent tree. clnt's * children, if it has any, still point to the old xprt. */ parent = clnt->cl_parent; clnt->cl_parent = clnt; /* * The old rpc_auth cache cannot be re-used. GSS * contexts in particular are between a single * client and server. */ err = rpc_client_register(clnt, pseudoflavor, NULL); if (err) goto out_revert; synchronize_rcu(); if (parent != clnt) rpc_release_client(parent); xprt_switch_put(oldxps); xprt_put(old); trace_rpc_clnt_replace_xprt(clnt); return 0; out_revert: xps = xprt_iter_xchg_switch(&clnt->cl_xpi, oldxps); rpc_clnt_set_transport(clnt, old, old_timeo); clnt->cl_parent = parent; rpc_client_register(clnt, pseudoflavor, NULL); xprt_switch_put(xps); xprt_put(xprt); trace_rpc_clnt_replace_xprt_err(clnt); return err; } EXPORT_SYMBOL_GPL(rpc_switch_client_transport); static int _rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi, void func(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps)) { struct rpc_xprt_switch *xps; rcu_read_lock(); xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); rcu_read_unlock(); if (xps == NULL) return -EAGAIN; func(xpi, xps); xprt_switch_put(xps); return 0; } static int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi) { return _rpc_clnt_xprt_iter_init(clnt, xpi, xprt_iter_init_listall); } static int rpc_clnt_xprt_iter_offline_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi) { return _rpc_clnt_xprt_iter_init(clnt, xpi, xprt_iter_init_listoffline); } /** * rpc_clnt_iterate_for_each_xprt - Apply a function to all transports * @clnt: pointer to client * @fn: function to apply * @data: void pointer to function data * * Iterates through the list of RPC transports currently attached to the * client and applies the function fn(clnt, xprt, data). * * On error, the iteration stops, and the function returns the error value. */ int rpc_clnt_iterate_for_each_xprt(struct rpc_clnt *clnt, int (*fn)(struct rpc_clnt *, struct rpc_xprt *, void *), void *data) { struct rpc_xprt_iter xpi; int ret; ret = rpc_clnt_xprt_iter_init(clnt, &xpi); if (ret) return ret; for (;;) { struct rpc_xprt *xprt = xprt_iter_get_next(&xpi); if (!xprt) break; ret = fn(clnt, xprt, data); xprt_put(xprt); if (ret < 0) break; } xprt_iter_destroy(&xpi); return ret; } EXPORT_SYMBOL_GPL(rpc_clnt_iterate_for_each_xprt); /* * Kill all tasks for the given client. * XXX: kill their descendants as well? */ void rpc_killall_tasks(struct rpc_clnt *clnt) { struct rpc_task *rovr; if (list_empty(&clnt->cl_tasks)) return; /* * Spin lock all_tasks to prevent changes... */ trace_rpc_clnt_killall(clnt); spin_lock(&clnt->cl_lock); list_for_each_entry(rovr, &clnt->cl_tasks, tk_task) rpc_signal_task(rovr); spin_unlock(&clnt->cl_lock); } EXPORT_SYMBOL_GPL(rpc_killall_tasks); /** * rpc_cancel_tasks - try to cancel a set of RPC tasks * @clnt: Pointer to RPC client * @error: RPC task error value to set * @fnmatch: Pointer to selector function * @data: User data * * Uses @fnmatch to define a set of RPC tasks that are to be cancelled. * The argument @error must be a negative error value. */ unsigned long rpc_cancel_tasks(struct rpc_clnt *clnt, int error, bool (*fnmatch)(const struct rpc_task *, const void *), const void *data) { struct rpc_task *task; unsigned long count = 0; if (list_empty(&clnt->cl_tasks)) return 0; /* * Spin lock all_tasks to prevent changes... */ spin_lock(&clnt->cl_lock); list_for_each_entry(task, &clnt->cl_tasks, tk_task) { if (!RPC_IS_ACTIVATED(task)) continue; if (!fnmatch(task, data)) continue; rpc_task_try_cancel(task, error); count++; } spin_unlock(&clnt->cl_lock); return count; } EXPORT_SYMBOL_GPL(rpc_cancel_tasks); static int rpc_clnt_disconnect_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *dummy) { if (xprt_connected(xprt)) xprt_force_disconnect(xprt); return 0; } void rpc_clnt_disconnect(struct rpc_clnt *clnt) { rpc_clnt_iterate_for_each_xprt(clnt, rpc_clnt_disconnect_xprt, NULL); } EXPORT_SYMBOL_GPL(rpc_clnt_disconnect); /* * Properly shut down an RPC client, terminating all outstanding * requests. */ void rpc_shutdown_client(struct rpc_clnt *clnt) { might_sleep(); trace_rpc_clnt_shutdown(clnt); while (!list_empty(&clnt->cl_tasks)) { rpc_killall_tasks(clnt); wait_event_timeout(destroy_wait, list_empty(&clnt->cl_tasks), 1*HZ); } rpc_release_client(clnt); } EXPORT_SYMBOL_GPL(rpc_shutdown_client); /* * Free an RPC client */ static void rpc_free_client_work(struct work_struct *work) { struct rpc_clnt *clnt = container_of(work, struct rpc_clnt, cl_work); trace_rpc_clnt_free(clnt); /* These might block on processes that might allocate memory, * so they cannot be called in rpciod, so they are handled separately * here. */ rpc_sysfs_client_destroy(clnt); rpc_clnt_debugfs_unregister(clnt); rpc_free_clid(clnt); rpc_clnt_remove_pipedir(clnt); xprt_put(rcu_dereference_raw(clnt->cl_xprt)); kfree(clnt); rpciod_down(); } static struct rpc_clnt * rpc_free_client(struct rpc_clnt *clnt) { struct rpc_clnt *parent = NULL; trace_rpc_clnt_release(clnt); if (clnt->cl_parent != clnt) parent = clnt->cl_parent; rpc_unregister_client(clnt); rpc_free_iostats(clnt->cl_metrics); clnt->cl_metrics = NULL; xprt_iter_destroy(&clnt->cl_xpi); put_cred(clnt->cl_cred); INIT_WORK(&clnt->cl_work, rpc_free_client_work); schedule_work(&clnt->cl_work); return parent; } /* * Free an RPC client */ static struct rpc_clnt * rpc_free_auth(struct rpc_clnt *clnt) { /* * Note: RPCSEC_GSS may need to send NULL RPC calls in order to * release remaining GSS contexts. This mechanism ensures * that it can do so safely. */ if (clnt->cl_auth != NULL) { rpcauth_release(clnt->cl_auth); clnt->cl_auth = NULL; } if (refcount_dec_and_test(&clnt->cl_count)) return rpc_free_client(clnt); return NULL; } /* * Release reference to the RPC client */ void rpc_release_client(struct rpc_clnt *clnt) { do { if (list_empty(&clnt->cl_tasks)) wake_up(&destroy_wait); if (refcount_dec_not_one(&clnt->cl_count)) break; clnt = rpc_free_auth(clnt); } while (clnt != NULL); } EXPORT_SYMBOL_GPL(rpc_release_client); /** * rpc_bind_new_program - bind a new RPC program to an existing client * @old: old rpc_client * @program: rpc program to set * @vers: rpc program version * * Clones the rpc client and sets up a new RPC program. This is mainly * of use for enabling different RPC programs to share the same transport. * The Sun NFSv2/v3 ACL protocol can do this. */ struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old, const struct rpc_program *program, u32 vers) { struct rpc_create_args args = { .program = program, .prognumber = program->number, .version = vers, .authflavor = old->cl_auth->au_flavor, .cred = old->cl_cred, }; struct rpc_clnt *clnt; int err; clnt = __rpc_clone_client(&args, old); if (IS_ERR(clnt)) goto out; err = rpc_ping(clnt); if (err != 0) { rpc_shutdown_client(clnt); clnt = ERR_PTR(err); } out: return clnt; } EXPORT_SYMBOL_GPL(rpc_bind_new_program); struct rpc_xprt * rpc_task_get_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) { struct rpc_xprt_switch *xps; if (!xprt) return NULL; rcu_read_lock(); xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); atomic_long_inc(&xps->xps_queuelen); rcu_read_unlock(); atomic_long_inc(&xprt->queuelen); return xprt; } static void rpc_task_release_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) { struct rpc_xprt_switch *xps; atomic_long_dec(&xprt->queuelen); rcu_read_lock(); xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); atomic_long_dec(&xps->xps_queuelen); rcu_read_unlock(); xprt_put(xprt); } void rpc_task_release_transport(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; if (xprt) { task->tk_xprt = NULL; if (task->tk_client) rpc_task_release_xprt(task->tk_client, xprt); else xprt_put(xprt); } } EXPORT_SYMBOL_GPL(rpc_task_release_transport); void rpc_task_release_client(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; rpc_task_release_transport(task); if (clnt != NULL) { /* Remove from client task list */ spin_lock(&clnt->cl_lock); list_del(&task->tk_task); spin_unlock(&clnt->cl_lock); task->tk_client = NULL; rpc_release_client(clnt); } } static struct rpc_xprt * rpc_task_get_first_xprt(struct rpc_clnt *clnt) { struct rpc_xprt *xprt; rcu_read_lock(); xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); rcu_read_unlock(); return rpc_task_get_xprt(clnt, xprt); } static struct rpc_xprt * rpc_task_get_next_xprt(struct rpc_clnt *clnt) { return rpc_task_get_xprt(clnt, xprt_iter_get_next(&clnt->cl_xpi)); } static void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt) { if (task->tk_xprt) { if (!(test_bit(XPRT_OFFLINE, &task->tk_xprt->state) && (task->tk_flags & RPC_TASK_MOVEABLE))) return; xprt_release(task); xprt_put(task->tk_xprt); } if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN) task->tk_xprt = rpc_task_get_first_xprt(clnt); else task->tk_xprt = rpc_task_get_next_xprt(clnt); } static void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) { rpc_task_set_transport(task, clnt); task->tk_client = clnt; refcount_inc(&clnt->cl_count); if (clnt->cl_softrtry) task->tk_flags |= RPC_TASK_SOFT; if (clnt->cl_softerr) task->tk_flags |= RPC_TASK_TIMEOUT; if (clnt->cl_noretranstimeo) task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT; /* Add to the client's list of all tasks */ spin_lock(&clnt->cl_lock); list_add_tail(&task->tk_task, &clnt->cl_tasks); spin_unlock(&clnt->cl_lock); } static void rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg) { if (msg != NULL) { task->tk_msg.rpc_proc = msg->rpc_proc; task->tk_msg.rpc_argp = msg->rpc_argp; task->tk_msg.rpc_resp = msg->rpc_resp; task->tk_msg.rpc_cred = msg->rpc_cred; if (!(task->tk_flags & RPC_TASK_CRED_NOREF)) get_cred(task->tk_msg.rpc_cred); } } /* * Default callback for async RPC calls */ static void rpc_default_callback(struct rpc_task *task, void *data) { } static const struct rpc_call_ops rpc_default_ops = { .rpc_call_done = rpc_default_callback, }; /** * rpc_run_task - Allocate a new RPC task, then run rpc_execute against it * @task_setup_data: pointer to task initialisation data */ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) { struct rpc_task *task; task = rpc_new_task(task_setup_data); if (IS_ERR(task)) return task; if (!RPC_IS_ASYNC(task)) task->tk_flags |= RPC_TASK_CRED_NOREF; rpc_task_set_client(task, task_setup_data->rpc_client); rpc_task_set_rpc_message(task, task_setup_data->rpc_message); if (task->tk_action == NULL) rpc_call_start(task); atomic_inc(&task->tk_count); rpc_execute(task); return task; } EXPORT_SYMBOL_GPL(rpc_run_task); /** * rpc_call_sync - Perform a synchronous RPC call * @clnt: pointer to RPC client * @msg: RPC call parameters * @flags: RPC call flags */ int rpc_call_sync(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags) { struct rpc_task *task; struct rpc_task_setup task_setup_data = { .rpc_client = clnt, .rpc_message = msg, .callback_ops = &rpc_default_ops, .flags = flags, }; int status; WARN_ON_ONCE(flags & RPC_TASK_ASYNC); if (flags & RPC_TASK_ASYNC) { rpc_release_calldata(task_setup_data.callback_ops, task_setup_data.callback_data); return -EINVAL; } task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); status = task->tk_status; rpc_put_task(task); return status; } EXPORT_SYMBOL_GPL(rpc_call_sync); /** * rpc_call_async - Perform an asynchronous RPC call * @clnt: pointer to RPC client * @msg: RPC call parameters * @flags: RPC call flags * @tk_ops: RPC call ops * @data: user call data */ int rpc_call_async(struct rpc_clnt *clnt, const struct rpc_message *msg, int flags, const struct rpc_call_ops *tk_ops, void *data) { struct rpc_task *task; struct rpc_task_setup task_setup_data = { .rpc_client = clnt, .rpc_message = msg, .callback_ops = tk_ops, .callback_data = data, .flags = flags|RPC_TASK_ASYNC, }; task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); rpc_put_task(task); return 0; } EXPORT_SYMBOL_GPL(rpc_call_async); #if defined(CONFIG_SUNRPC_BACKCHANNEL) static void call_bc_encode(struct rpc_task *task); /** * rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run * rpc_execute against it * @req: RPC request */ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) { struct rpc_task *task; struct rpc_task_setup task_setup_data = { .callback_ops = &rpc_default_ops, .flags = RPC_TASK_SOFTCONN | RPC_TASK_NO_RETRANS_TIMEOUT, }; dprintk("RPC: rpc_run_bc_task req= %p\n", req); /* * Create an rpc_task to send the data */ task = rpc_new_task(&task_setup_data); if (IS_ERR(task)) { xprt_free_bc_request(req); return task; } xprt_init_bc_request(req, task); task->tk_action = call_bc_encode; atomic_inc(&task->tk_count); WARN_ON_ONCE(atomic_read(&task->tk_count) != 2); rpc_execute(task); dprintk("RPC: rpc_run_bc_task: task= %p\n", task); return task; } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ /** * rpc_prepare_reply_pages - Prepare to receive a reply data payload into pages * @req: RPC request to prepare * @pages: vector of struct page pointers * @base: offset in first page where receive should start, in bytes * @len: expected size of the upper layer data payload, in bytes * @hdrsize: expected size of upper layer reply header, in XDR words * */ void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages, unsigned int base, unsigned int len, unsigned int hdrsize) { hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign; xdr_inline_pages(&req->rq_rcv_buf, hdrsize << 2, pages, base, len); trace_rpc_xdr_reply_pages(req->rq_task, &req->rq_rcv_buf); } EXPORT_SYMBOL_GPL(rpc_prepare_reply_pages); void rpc_call_start(struct rpc_task *task) { task->tk_action = call_start; } EXPORT_SYMBOL_GPL(rpc_call_start); /** * rpc_peeraddr - extract remote peer address from clnt's xprt * @clnt: RPC client structure * @buf: target buffer * @bufsize: length of target buffer * * Returns the number of bytes that are actually in the stored address. */ size_t rpc_peeraddr(struct rpc_clnt *clnt, struct sockaddr *buf, size_t bufsize) { size_t bytes; struct rpc_xprt *xprt; rcu_read_lock(); xprt = rcu_dereference(clnt->cl_xprt); bytes = xprt->addrlen; if (bytes > bufsize) bytes = bufsize; memcpy(buf, &xprt->addr, bytes); rcu_read_unlock(); return bytes; } EXPORT_SYMBOL_GPL(rpc_peeraddr); /** * rpc_peeraddr2str - return remote peer address in printable format * @clnt: RPC client structure * @format: address format * * NB: the lifetime of the memory referenced by the returned pointer is * the same as the rpc_xprt itself. As long as the caller uses this * pointer, it must hold the RCU read lock. */ const char *rpc_peeraddr2str(struct rpc_clnt *clnt, enum rpc_display_format_t format) { struct rpc_xprt *xprt; xprt = rcu_dereference(clnt->cl_xprt); if (xprt->address_strings[format] != NULL) return xprt->address_strings[format]; else return "unprintable"; } EXPORT_SYMBOL_GPL(rpc_peeraddr2str); static const struct sockaddr_in rpc_inaddr_loopback = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), }; static const struct sockaddr_in6 rpc_in6addr_loopback = { .sin6_family = AF_INET6, .sin6_addr = IN6ADDR_ANY_INIT, }; /* * Try a getsockname() on a connected datagram socket. Using a * connected datagram socket prevents leaving a socket in TIME_WAIT. * This conserves the ephemeral port number space. * * Returns zero and fills in "buf" if successful; otherwise, a * negative errno is returned. */ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen, struct sockaddr *buf) { struct socket *sock; int err; err = __sock_create(net, sap->sa_family, SOCK_DGRAM, IPPROTO_UDP, &sock, 1); if (err < 0) { dprintk("RPC: can't create UDP socket (%d)\n", err); goto out; } switch (sap->sa_family) { case AF_INET: err = kernel_bind(sock, (struct sockaddr *)&rpc_inaddr_loopback, sizeof(rpc_inaddr_loopback)); break; case AF_INET6: err = kernel_bind(sock, (struct sockaddr *)&rpc_in6addr_loopback, sizeof(rpc_in6addr_loopback)); break; default: err = -EAFNOSUPPORT; goto out_release; } if (err < 0) { dprintk("RPC: can't bind UDP socket (%d)\n", err); goto out_release; } err = kernel_connect(sock, sap, salen, 0); if (err < 0) { dprintk("RPC: can't connect UDP socket (%d)\n", err); goto out_release; } err = kernel_getsockname(sock, buf); if (err < 0) { dprintk("RPC: getsockname failed (%d)\n", err); goto out_release; } err = 0; if (buf->sa_family == AF_INET6) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)buf; sin6->sin6_scope_id = 0; } dprintk("RPC: %s succeeded\n", __func__); out_release: sock_release(sock); out: return err; } /* * Scraping a connected socket failed, so we don't have a useable * local address. Fallback: generate an address that will prevent * the server from calling us back. * * Returns zero and fills in "buf" if successful; otherwise, a * negative errno is returned. */ static int rpc_anyaddr(int family, struct sockaddr *buf, size_t buflen) { switch (family) { case AF_INET: if (buflen < sizeof(rpc_inaddr_loopback)) return -EINVAL; memcpy(buf, &rpc_inaddr_loopback, sizeof(rpc_inaddr_loopback)); break; case AF_INET6: if (buflen < sizeof(rpc_in6addr_loopback)) return -EINVAL; memcpy(buf, &rpc_in6addr_loopback, sizeof(rpc_in6addr_loopback)); break; default: dprintk("RPC: %s: address family not supported\n", __func__); return -EAFNOSUPPORT; } dprintk("RPC: %s: succeeded\n", __func__); return 0; } /** * rpc_localaddr - discover local endpoint address for an RPC client * @clnt: RPC client structure * @buf: target buffer * @buflen: size of target buffer, in bytes * * Returns zero and fills in "buf" and "buflen" if successful; * otherwise, a negative errno is returned. * * This works even if the underlying transport is not currently connected, * or if the upper layer never previously provided a source address. * * The result of this function call is transient: multiple calls in * succession may give different results, depending on how local * networking configuration changes over time. */ int rpc_localaddr(struct rpc_clnt *clnt, struct sockaddr *buf, size_t buflen) { struct sockaddr_storage address; struct sockaddr *sap = (struct sockaddr *)&address; struct rpc_xprt *xprt; struct net *net; size_t salen; int err; rcu_read_lock(); xprt = rcu_dereference(clnt->cl_xprt); salen = xprt->addrlen; memcpy(sap, &xprt->addr, salen); net = get_net(xprt->xprt_net); rcu_read_unlock(); rpc_set_port(sap, 0); err = rpc_sockname(net, sap, salen, buf); put_net(net); if (err != 0) /* Couldn't discover local address, return ANYADDR */ return rpc_anyaddr(sap->sa_family, buf, buflen); return 0; } EXPORT_SYMBOL_GPL(rpc_localaddr); void rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize) { struct rpc_xprt *xprt; rcu_read_lock(); xprt = rcu_dereference(clnt->cl_xprt); if (xprt->ops->set_buffer_size) xprt->ops->set_buffer_size(xprt, sndsize, rcvsize); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rpc_setbufsize); /** * rpc_net_ns - Get the network namespace for this RPC client * @clnt: RPC client to query * */ struct net *rpc_net_ns(struct rpc_clnt *clnt) { struct net *ret; rcu_read_lock(); ret = rcu_dereference(clnt->cl_xprt)->xprt_net; rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rpc_net_ns); /** * rpc_max_payload - Get maximum payload size for a transport, in bytes * @clnt: RPC client to query * * For stream transports, this is one RPC record fragment (see RFC * 1831), as we don't support multi-record requests yet. For datagram * transports, this is the size of an IP packet minus the IP, UDP, and * RPC header sizes. */ size_t rpc_max_payload(struct rpc_clnt *clnt) { size_t ret; rcu_read_lock(); ret = rcu_dereference(clnt->cl_xprt)->max_payload; rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rpc_max_payload); /** * rpc_max_bc_payload - Get maximum backchannel payload size, in bytes * @clnt: RPC client to query */ size_t rpc_max_bc_payload(struct rpc_clnt *clnt) { struct rpc_xprt *xprt; size_t ret; rcu_read_lock(); xprt = rcu_dereference(clnt->cl_xprt); ret = xprt->ops->bc_maxpayload(xprt); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rpc_max_bc_payload); unsigned int rpc_num_bc_slots(struct rpc_clnt *clnt) { struct rpc_xprt *xprt; unsigned int ret; rcu_read_lock(); xprt = rcu_dereference(clnt->cl_xprt); ret = xprt->ops->bc_num_slots(xprt); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rpc_num_bc_slots); /** * rpc_force_rebind - force transport to check that remote port is unchanged * @clnt: client to rebind * */ void rpc_force_rebind(struct rpc_clnt *clnt) { if (clnt->cl_autobind) { rcu_read_lock(); xprt_clear_bound(rcu_dereference(clnt->cl_xprt)); rcu_read_unlock(); } } EXPORT_SYMBOL_GPL(rpc_force_rebind); static int __rpc_restart_call(struct rpc_task *task, void (*action)(struct rpc_task *)) { task->tk_status = 0; task->tk_rpc_status = 0; task->tk_action = action; return 1; } /* * Restart an (async) RPC call. Usually called from within the * exit handler. */ int rpc_restart_call(struct rpc_task *task) { return __rpc_restart_call(task, call_start); } EXPORT_SYMBOL_GPL(rpc_restart_call); /* * Restart an (async) RPC call from the call_prepare state. * Usually called from within the exit handler. */ int rpc_restart_call_prepare(struct rpc_task *task) { if (task->tk_ops->rpc_call_prepare != NULL) return __rpc_restart_call(task, rpc_prepare_task); return rpc_restart_call(task); } EXPORT_SYMBOL_GPL(rpc_restart_call_prepare); const char *rpc_proc_name(const struct rpc_task *task) { const struct rpc_procinfo *proc = task->tk_msg.rpc_proc; if (proc) { if (proc->p_name) return proc->p_name; else return "NULL"; } else return "no proc"; } static void __rpc_call_rpcerror(struct rpc_task *task, int tk_status, int rpc_status) { trace_rpc_call_rpcerror(task, tk_status, rpc_status); rpc_task_set_rpc_status(task, rpc_status); rpc_exit(task, tk_status); } static void rpc_call_rpcerror(struct rpc_task *task, int status) { __rpc_call_rpcerror(task, status, status); } /* * 0. Initial state * * Other FSM states can be visited zero or more times, but * this state is visited exactly once for each RPC. */ static void call_start(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; int idx = task->tk_msg.rpc_proc->p_statidx; trace_rpc_request(task); if (task->tk_client->cl_shutdown) { rpc_call_rpcerror(task, -EIO); return; } /* Increment call count (version might not be valid for ping) */ if (clnt->cl_program->version[clnt->cl_vers]) clnt->cl_program->version[clnt->cl_vers]->counts[idx]++; clnt->cl_stats->rpccnt++; task->tk_action = call_reserve; rpc_task_set_transport(task, clnt); } /* * 1. Reserve an RPC call slot */ static void call_reserve(struct rpc_task *task) { task->tk_status = 0; task->tk_action = call_reserveresult; xprt_reserve(task); } static void call_retry_reserve(struct rpc_task *task); /* * 1b. Grok the result of xprt_reserve() */ static void call_reserveresult(struct rpc_task *task) { int status = task->tk_status; /* * After a call to xprt_reserve(), we must have either * a request slot or else an error status. */ task->tk_status = 0; if (status >= 0) { if (task->tk_rqstp) { task->tk_action = call_refresh; return; } rpc_call_rpcerror(task, -EIO); return; } switch (status) { case -ENOMEM: rpc_delay(task, HZ >> 2); fallthrough; case -EAGAIN: /* woken up; retry */ task->tk_action = call_retry_reserve; return; default: rpc_call_rpcerror(task, status); } } /* * 1c. Retry reserving an RPC call slot */ static void call_retry_reserve(struct rpc_task *task) { task->tk_status = 0; task->tk_action = call_reserveresult; xprt_retry_reserve(task); } /* * 2. Bind and/or refresh the credentials */ static void call_refresh(struct rpc_task *task) { task->tk_action = call_refreshresult; task->tk_status = 0; task->tk_client->cl_stats->rpcauthrefresh++; rpcauth_refreshcred(task); } /* * 2a. Process the results of a credential refresh */ static void call_refreshresult(struct rpc_task *task) { int status = task->tk_status; task->tk_status = 0; task->tk_action = call_refresh; switch (status) { case 0: if (rpcauth_uptodatecred(task)) { task->tk_action = call_allocate; return; } /* Use rate-limiting and a max number of retries if refresh * had status 0 but failed to update the cred. */ fallthrough; case -ETIMEDOUT: rpc_delay(task, 3*HZ); fallthrough; case -EAGAIN: status = -EACCES; fallthrough; case -EKEYEXPIRED: if (!task->tk_cred_retry) break; task->tk_cred_retry--; trace_rpc_retry_refresh_status(task); return; case -ENOMEM: rpc_delay(task, HZ >> 4); return; } trace_rpc_refresh_status(task); rpc_call_rpcerror(task, status); } /* * 2b. Allocate the buffer. For details, see sched.c:rpc_malloc. * (Note: buffer memory is freed in xprt_release). */ static void call_allocate(struct rpc_task *task) { const struct rpc_auth *auth = task->tk_rqstp->rq_cred->cr_auth; struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; const struct rpc_procinfo *proc = task->tk_msg.rpc_proc; int status; task->tk_status = 0; task->tk_action = call_encode; if (req->rq_buffer) return; if (proc->p_proc != 0) { BUG_ON(proc->p_arglen == 0); if (proc->p_decode != NULL) BUG_ON(proc->p_replen == 0); } /* * Calculate the size (in quads) of the RPC call * and reply headers, and convert both values * to byte sizes. */ req->rq_callsize = RPC_CALLHDRSIZE + (auth->au_cslack << 1) + proc->p_arglen; req->rq_callsize <<= 2; /* * Note: the reply buffer must at minimum allocate enough space * for the 'struct accepted_reply' from RFC5531. */ req->rq_rcvsize = RPC_REPHDRSIZE + auth->au_rslack + \ max_t(size_t, proc->p_replen, 2); req->rq_rcvsize <<= 2; status = xprt->ops->buf_alloc(task); trace_rpc_buf_alloc(task, status); if (status == 0) return; if (status != -ENOMEM) { rpc_call_rpcerror(task, status); return; } if (RPC_IS_ASYNC(task) || !fatal_signal_pending(current)) { task->tk_action = call_allocate; rpc_delay(task, HZ>>4); return; } rpc_call_rpcerror(task, -ERESTARTSYS); } static int rpc_task_need_encode(struct rpc_task *task) { return test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) == 0 && (!(task->tk_flags & RPC_TASK_SENT) || !(task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) || xprt_request_need_retransmit(task)); } static void rpc_xdr_encode(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct xdr_stream xdr; xdr_buf_init(&req->rq_snd_buf, req->rq_buffer, req->rq_callsize); xdr_buf_init(&req->rq_rcv_buf, req->rq_rbuffer, req->rq_rcvsize); req->rq_reply_bytes_recvd = 0; req->rq_snd_buf.head[0].iov_len = 0; xdr_init_encode(&xdr, &req->rq_snd_buf, req->rq_snd_buf.head[0].iov_base, req); if (rpc_encode_header(task, &xdr)) return; task->tk_status = rpcauth_wrap_req(task, &xdr); } /* * 3. Encode arguments of an RPC call */ static void call_encode(struct rpc_task *task) { if (!rpc_task_need_encode(task)) goto out; /* Dequeue task from the receive queue while we're encoding */ xprt_request_dequeue_xprt(task); /* Encode here so that rpcsec_gss can use correct sequence number. */ rpc_xdr_encode(task); /* Add task to reply queue before transmission to avoid races */ if (task->tk_status == 0 && rpc_reply_expected(task)) task->tk_status = xprt_request_enqueue_receive(task); /* Did the encode result in an error condition? */ if (task->tk_status != 0) { /* Was the error nonfatal? */ switch (task->tk_status) { case -EAGAIN: case -ENOMEM: rpc_delay(task, HZ >> 4); break; case -EKEYEXPIRED: if (!task->tk_cred_retry) { rpc_call_rpcerror(task, task->tk_status); } else { task->tk_action = call_refresh; task->tk_cred_retry--; trace_rpc_retry_refresh_status(task); } break; default: rpc_call_rpcerror(task, task->tk_status); } return; } xprt_request_enqueue_transmit(task); out: task->tk_action = call_transmit; /* Check that the connection is OK */ if (!xprt_bound(task->tk_xprt)) task->tk_action = call_bind; else if (!xprt_connected(task->tk_xprt)) task->tk_action = call_connect; } /* * Helpers to check if the task was already transmitted, and * to take action when that is the case. */ static bool rpc_task_transmitted(struct rpc_task *task) { return !test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate); } static void rpc_task_handle_transmitted(struct rpc_task *task) { xprt_end_transmit(task); task->tk_action = call_transmit_status; } /* * 4. Get the server port number if not yet set */ static void call_bind(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; if (rpc_task_transmitted(task)) { rpc_task_handle_transmitted(task); return; } if (xprt_bound(xprt)) { task->tk_action = call_connect; return; } task->tk_action = call_bind_status; if (!xprt_prepare_transmit(task)) return; xprt->ops->rpcbind(task); } /* * 4a. Sort out bind result */ static void call_bind_status(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; int status = -EIO; if (rpc_task_transmitted(task)) { rpc_task_handle_transmitted(task); return; } if (task->tk_status >= 0) goto out_next; if (xprt_bound(xprt)) { task->tk_status = 0; goto out_next; } switch (task->tk_status) { case -ENOMEM: rpc_delay(task, HZ >> 2); goto retry_timeout; case -EACCES: trace_rpcb_prog_unavail_err(task); /* fail immediately if this is an RPC ping */ if (task->tk_msg.rpc_proc->p_proc == 0) { status = -EOPNOTSUPP; break; } rpc_delay(task, 3*HZ); goto retry_timeout; case -ENOBUFS: rpc_delay(task, HZ >> 2); goto retry_timeout; case -EAGAIN: goto retry_timeout; case -ETIMEDOUT: trace_rpcb_timeout_err(task); goto retry_timeout; case -EPFNOSUPPORT: /* server doesn't support any rpcbind version we know of */ trace_rpcb_bind_version_err(task); break; case -EPROTONOSUPPORT: trace_rpcb_bind_version_err(task); goto retry_timeout; case -ECONNREFUSED: /* connection problems */ case -ECONNRESET: case -ECONNABORTED: case -ENOTCONN: case -EHOSTDOWN: case -ENETDOWN: case -EHOSTUNREACH: case -ENETUNREACH: case -EPIPE: trace_rpcb_unreachable_err(task); if (!RPC_IS_SOFTCONN(task)) { rpc_delay(task, 5*HZ); goto retry_timeout; } status = task->tk_status; break; default: trace_rpcb_unrecognized_err(task); } rpc_call_rpcerror(task, status); return; out_next: task->tk_action = call_connect; return; retry_timeout: task->tk_status = 0; task->tk_action = call_bind; rpc_check_timeout(task); } /* * 4b. Connect to the RPC server */ static void call_connect(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; if (rpc_task_transmitted(task)) { rpc_task_handle_transmitted(task); return; } if (xprt_connected(xprt)) { task->tk_action = call_transmit; return; } task->tk_action = call_connect_status; if (task->tk_status < 0) return; if (task->tk_flags & RPC_TASK_NOCONNECT) { rpc_call_rpcerror(task, -ENOTCONN); return; } if (!xprt_prepare_transmit(task)) return; xprt_connect(task); } /* * 4c. Sort out connect result */ static void call_connect_status(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; struct rpc_clnt *clnt = task->tk_client; int status = task->tk_status; if (rpc_task_transmitted(task)) { rpc_task_handle_transmitted(task); return; } trace_rpc_connect_status(task); if (task->tk_status == 0) { clnt->cl_stats->netreconn++; goto out_next; } if (xprt_connected(xprt)) { task->tk_status = 0; goto out_next; } task->tk_status = 0; switch (status) { case -ECONNREFUSED: case -ECONNRESET: /* A positive refusal suggests a rebind is needed. */ if (RPC_IS_SOFTCONN(task)) break; if (clnt->cl_autobind) { rpc_force_rebind(clnt); goto out_retry; } fallthrough; case -ECONNABORTED: case -ENETDOWN: case -ENETUNREACH: case -EHOSTUNREACH: case -EPIPE: case -EPROTO: xprt_conditional_disconnect(task->tk_rqstp->rq_xprt, task->tk_rqstp->rq_connect_cookie); if (RPC_IS_SOFTCONN(task)) break; /* retry with existing socket, after a delay */ rpc_delay(task, 3*HZ); fallthrough; case -EADDRINUSE: case -ENOTCONN: case -EAGAIN: case -ETIMEDOUT: if (!(task->tk_flags & RPC_TASK_NO_ROUND_ROBIN) && (task->tk_flags & RPC_TASK_MOVEABLE) && test_bit(XPRT_REMOVE, &xprt->state)) { struct rpc_xprt *saved = task->tk_xprt; struct rpc_xprt_switch *xps; rcu_read_lock(); xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); rcu_read_unlock(); if (xps->xps_nxprts > 1) { long value; xprt_release(task); value = atomic_long_dec_return(&xprt->queuelen); if (value == 0) rpc_xprt_switch_remove_xprt(xps, saved, true); xprt_put(saved); task->tk_xprt = NULL; task->tk_action = call_start; } xprt_switch_put(xps); if (!task->tk_xprt) goto out; } goto out_retry; case -ENOBUFS: rpc_delay(task, HZ >> 2); goto out_retry; } rpc_call_rpcerror(task, status); return; out_next: task->tk_action = call_transmit; return; out_retry: /* Check for timeouts before looping back to call_bind */ task->tk_action = call_bind; out: rpc_check_timeout(task); } /* * 5. Transmit the RPC request, and wait for reply */ static void call_transmit(struct rpc_task *task) { if (rpc_task_transmitted(task)) { rpc_task_handle_transmitted(task); return; } task->tk_action = call_transmit_status; if (!xprt_prepare_transmit(task)) return; task->tk_status = 0; if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) { if (!xprt_connected(task->tk_xprt)) { task->tk_status = -ENOTCONN; return; } xprt_transmit(task); } xprt_end_transmit(task); } /* * 5a. Handle cleanup after a transmission */ static void call_transmit_status(struct rpc_task *task) { task->tk_action = call_status; /* * Common case: success. Force the compiler to put this * test first. */ if (rpc_task_transmitted(task)) { task->tk_status = 0; xprt_request_wait_receive(task); return; } switch (task->tk_status) { default: break; case -EBADMSG: task->tk_status = 0; task->tk_action = call_encode; break; /* * Special cases: if we've been waiting on the * socket's write_space() callback, or if the * socket just returned a connection error, * then hold onto the transport lock. */ case -ENOMEM: case -ENOBUFS: rpc_delay(task, HZ>>2); fallthrough; case -EBADSLT: case -EAGAIN: task->tk_action = call_transmit; task->tk_status = 0; break; case -ECONNREFUSED: case -EHOSTDOWN: case -ENETDOWN: case -EHOSTUNREACH: case -ENETUNREACH: case -EPERM: if (RPC_IS_SOFTCONN(task)) { if (!task->tk_msg.rpc_proc->p_proc) trace_xprt_ping(task->tk_xprt, task->tk_status); rpc_call_rpcerror(task, task->tk_status); return; } fallthrough; case -ECONNRESET: case -ECONNABORTED: case -EADDRINUSE: case -ENOTCONN: case -EPIPE: task->tk_action = call_bind; task->tk_status = 0; break; } rpc_check_timeout(task); } #if defined(CONFIG_SUNRPC_BACKCHANNEL) static void call_bc_transmit(struct rpc_task *task); static void call_bc_transmit_status(struct rpc_task *task); static void call_bc_encode(struct rpc_task *task) { xprt_request_enqueue_transmit(task); task->tk_action = call_bc_transmit; } /* * 5b. Send the backchannel RPC reply. On error, drop the reply. In * addition, disconnect on connectivity errors. */ static void call_bc_transmit(struct rpc_task *task) { task->tk_action = call_bc_transmit_status; if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) { if (!xprt_prepare_transmit(task)) return; task->tk_status = 0; xprt_transmit(task); } xprt_end_transmit(task); } static void call_bc_transmit_status(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; if (rpc_task_transmitted(task)) task->tk_status = 0; switch (task->tk_status) { case 0: /* Success */ case -ENETDOWN: case -EHOSTDOWN: case -EHOSTUNREACH: case -ENETUNREACH: case -ECONNRESET: case -ECONNREFUSED: case -EADDRINUSE: case -ENOTCONN: case -EPIPE: break; case -ENOMEM: case -ENOBUFS: rpc_delay(task, HZ>>2); fallthrough; case -EBADSLT: case -EAGAIN: task->tk_status = 0; task->tk_action = call_bc_transmit; return; case -ETIMEDOUT: /* * Problem reaching the server. Disconnect and let the * forechannel reestablish the connection. The server will * have to retransmit the backchannel request and we'll * reprocess it. Since these ops are idempotent, there's no * need to cache our reply at this time. */ printk(KERN_NOTICE "RPC: Could not send backchannel reply " "error: %d\n", task->tk_status); xprt_conditional_disconnect(req->rq_xprt, req->rq_connect_cookie); break; default: /* * We were unable to reply and will have to drop the * request. The server should reconnect and retransmit. */ printk(KERN_NOTICE "RPC: Could not send backchannel reply " "error: %d\n", task->tk_status); break; } task->tk_action = rpc_exit_task; } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ /* * 6. Sort out the RPC call status */ static void call_status(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; int status; if (!task->tk_msg.rpc_proc->p_proc) trace_xprt_ping(task->tk_xprt, task->tk_status); status = task->tk_status; if (status >= 0) { task->tk_action = call_decode; return; } trace_rpc_call_status(task); task->tk_status = 0; switch(status) { case -EHOSTDOWN: case -ENETDOWN: case -EHOSTUNREACH: case -ENETUNREACH: case -EPERM: if (RPC_IS_SOFTCONN(task)) goto out_exit; /* * Delay any retries for 3 seconds, then handle as if it * were a timeout. */ rpc_delay(task, 3*HZ); fallthrough; case -ETIMEDOUT: break; case -ECONNREFUSED: case -ECONNRESET: case -ECONNABORTED: case -ENOTCONN: rpc_force_rebind(clnt); break; case -EADDRINUSE: rpc_delay(task, 3*HZ); fallthrough; case -EPIPE: case -EAGAIN: break; case -ENFILE: case -ENOBUFS: case -ENOMEM: rpc_delay(task, HZ>>2); break; case -EIO: /* shutdown or soft timeout */ goto out_exit; default: if (clnt->cl_chatty) printk("%s: RPC call returned error %d\n", clnt->cl_program->name, -status); goto out_exit; } task->tk_action = call_encode; rpc_check_timeout(task); return; out_exit: rpc_call_rpcerror(task, status); } static bool rpc_check_connected(const struct rpc_rqst *req) { /* No allocated request or transport? return true */ if (!req || !req->rq_xprt) return true; return xprt_connected(req->rq_xprt); } static void rpc_check_timeout(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; if (RPC_SIGNALLED(task)) return; if (xprt_adjust_timeout(task->tk_rqstp) == 0) return; trace_rpc_timeout_status(task); task->tk_timeouts++; if (RPC_IS_SOFTCONN(task) && !rpc_check_connected(task->tk_rqstp)) { rpc_call_rpcerror(task, -ETIMEDOUT); return; } if (RPC_IS_SOFT(task)) { /* * Once a "no retrans timeout" soft tasks (a.k.a NFSv4) has * been sent, it should time out only if the transport * connection gets terminally broken. */ if ((task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) && rpc_check_connected(task->tk_rqstp)) return; if (clnt->cl_chatty) { pr_notice_ratelimited( "%s: server %s not responding, timed out\n", clnt->cl_program->name, task->tk_xprt->servername); } if (task->tk_flags & RPC_TASK_TIMEOUT) rpc_call_rpcerror(task, -ETIMEDOUT); else __rpc_call_rpcerror(task, -EIO, -ETIMEDOUT); return; } if (!(task->tk_flags & RPC_CALL_MAJORSEEN)) { task->tk_flags |= RPC_CALL_MAJORSEEN; if (clnt->cl_chatty) { pr_notice_ratelimited( "%s: server %s not responding, still trying\n", clnt->cl_program->name, task->tk_xprt->servername); } } rpc_force_rebind(clnt); /* * Did our request time out due to an RPCSEC_GSS out-of-sequence * event? RFC2203 requires the server to drop all such requests. */ rpcauth_invalcred(task); } /* * 7. Decode the RPC reply */ static void call_decode(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; struct xdr_stream xdr; int err; if (!task->tk_msg.rpc_proc->p_decode) { task->tk_action = rpc_exit_task; return; } if (task->tk_flags & RPC_CALL_MAJORSEEN) { if (clnt->cl_chatty) { pr_notice_ratelimited("%s: server %s OK\n", clnt->cl_program->name, task->tk_xprt->servername); } task->tk_flags &= ~RPC_CALL_MAJORSEEN; } /* * Did we ever call xprt_complete_rqst()? If not, we should assume * the message is incomplete. */ err = -EAGAIN; if (!req->rq_reply_bytes_recvd) goto out; /* Ensure that we see all writes made by xprt_complete_rqst() * before it changed req->rq_reply_bytes_recvd. */ smp_rmb(); req->rq_rcv_buf.len = req->rq_private_buf.len; trace_rpc_xdr_recvfrom(task, &req->rq_rcv_buf); /* Check that the softirq receive buffer is valid */ WARN_ON(memcmp(&req->rq_rcv_buf, &req->rq_private_buf, sizeof(req->rq_rcv_buf)) != 0); xdr_init_decode(&xdr, &req->rq_rcv_buf, req->rq_rcv_buf.head[0].iov_base, req); err = rpc_decode_header(task, &xdr); out: switch (err) { case 0: task->tk_action = rpc_exit_task; task->tk_status = rpcauth_unwrap_resp(task, &xdr); xdr_finish_decode(&xdr); return; case -EAGAIN: task->tk_status = 0; if (task->tk_client->cl_discrtry) xprt_conditional_disconnect(req->rq_xprt, req->rq_connect_cookie); task->tk_action = call_encode; rpc_check_timeout(task); break; case -EKEYREJECTED: task->tk_action = call_reserve; rpc_check_timeout(task); rpcauth_invalcred(task); /* Ensure we obtain a new XID if we retry! */ xprt_release(task); } } static int rpc_encode_header(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; __be32 *p; int error; error = -EMSGSIZE; p = xdr_reserve_space(xdr, RPC_CALLHDRSIZE << 2); if (!p) goto out_fail; *p++ = req->rq_xid; *p++ = rpc_call; *p++ = cpu_to_be32(RPC_VERSION); *p++ = cpu_to_be32(clnt->cl_prog); *p++ = cpu_to_be32(clnt->cl_vers); *p = cpu_to_be32(task->tk_msg.rpc_proc->p_proc); error = rpcauth_marshcred(task, xdr); if (error < 0) goto out_fail; return 0; out_fail: trace_rpc_bad_callhdr(task); rpc_call_rpcerror(task, error); return error; } static noinline int rpc_decode_header(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_clnt *clnt = task->tk_client; int error; __be32 *p; /* RFC-1014 says that the representation of XDR data must be a * multiple of four bytes * - if it isn't pointer subtraction in the NFS client may give * undefined results */ if (task->tk_rqstp->rq_rcv_buf.len & 3) goto out_unparsable; p = xdr_inline_decode(xdr, 3 * sizeof(*p)); if (!p) goto out_unparsable; p++; /* skip XID */ if (*p++ != rpc_reply) goto out_unparsable; if (*p++ != rpc_msg_accepted) goto out_msg_denied; error = rpcauth_checkverf(task, xdr); if (error) goto out_verifier; p = xdr_inline_decode(xdr, sizeof(*p)); if (!p) goto out_unparsable; switch (*p) { case rpc_success: return 0; case rpc_prog_unavail: trace_rpc__prog_unavail(task); error = -EPFNOSUPPORT; goto out_err; case rpc_prog_mismatch: trace_rpc__prog_mismatch(task); error = -EPROTONOSUPPORT; goto out_err; case rpc_proc_unavail: trace_rpc__proc_unavail(task); error = -EOPNOTSUPP; goto out_err; case rpc_garbage_args: case rpc_system_err: trace_rpc__garbage_args(task); error = -EIO; break; default: goto out_unparsable; } out_garbage: clnt->cl_stats->rpcgarbage++; if (task->tk_garb_retry) { task->tk_garb_retry--; task->tk_action = call_encode; return -EAGAIN; } out_err: rpc_call_rpcerror(task, error); return error; out_unparsable: trace_rpc__unparsable(task); error = -EIO; goto out_garbage; out_verifier: trace_rpc_bad_verifier(task); switch (error) { case -EPROTONOSUPPORT: goto out_err; case -EACCES: /* Re-encode with a fresh cred */ fallthrough; default: goto out_garbage; } out_msg_denied: error = -EACCES; p = xdr_inline_decode(xdr, sizeof(*p)); if (!p) goto out_unparsable; switch (*p++) { case rpc_auth_error: break; case rpc_mismatch: trace_rpc__mismatch(task); error = -EPROTONOSUPPORT; goto out_err; default: goto out_unparsable; } p = xdr_inline_decode(xdr, sizeof(*p)); if (!p) goto out_unparsable; switch (*p++) { case rpc_autherr_rejectedcred: case rpc_autherr_rejectedverf: case rpcsec_gsserr_credproblem: case rpcsec_gsserr_ctxproblem: rpcauth_invalcred(task); if (!task->tk_cred_retry) break; task->tk_cred_retry--; trace_rpc__stale_creds(task); return -EKEYREJECTED; case rpc_autherr_badcred: case rpc_autherr_badverf: /* possibly garbled cred/verf? */ if (!task->tk_garb_retry) break; task->tk_garb_retry--; trace_rpc__bad_creds(task); task->tk_action = call_encode; return -EAGAIN; case rpc_autherr_tooweak: trace_rpc__auth_tooweak(task); pr_warn("RPC: server %s requires stronger authentication.\n", task->tk_xprt->servername); break; default: goto out_unparsable; } goto out_err; } static void rpcproc_encode_null(struct rpc_rqst *rqstp, struct xdr_stream *xdr, const void *obj) { } static int rpcproc_decode_null(struct rpc_rqst *rqstp, struct xdr_stream *xdr, void *obj) { return 0; } static const struct rpc_procinfo rpcproc_null = { .p_encode = rpcproc_encode_null, .p_decode = rpcproc_decode_null, }; static const struct rpc_procinfo rpcproc_null_noreply = { .p_encode = rpcproc_encode_null, }; static void rpc_null_call_prepare(struct rpc_task *task, void *data) { task->tk_flags &= ~RPC_TASK_NO_RETRANS_TIMEOUT; rpc_call_start(task); } static const struct rpc_call_ops rpc_null_ops = { .rpc_call_prepare = rpc_null_call_prepare, .rpc_call_done = rpc_default_callback, }; static struct rpc_task *rpc_call_null_helper(struct rpc_clnt *clnt, struct rpc_xprt *xprt, struct rpc_cred *cred, int flags, const struct rpc_call_ops *ops, void *data) { struct rpc_message msg = { .rpc_proc = &rpcproc_null, }; struct rpc_task_setup task_setup_data = { .rpc_client = clnt, .rpc_xprt = xprt, .rpc_message = &msg, .rpc_op_cred = cred, .callback_ops = ops ?: &rpc_null_ops, .callback_data = data, .flags = flags | RPC_TASK_SOFT | RPC_TASK_SOFTCONN | RPC_TASK_NULLCREDS, }; return rpc_run_task(&task_setup_data); } struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int flags) { return rpc_call_null_helper(clnt, NULL, cred, flags, NULL, NULL); } EXPORT_SYMBOL_GPL(rpc_call_null); static int rpc_ping(struct rpc_clnt *clnt) { struct rpc_task *task; int status; if (clnt->cl_auth->au_ops->ping) return clnt->cl_auth->au_ops->ping(clnt); task = rpc_call_null_helper(clnt, NULL, NULL, 0, NULL, NULL); if (IS_ERR(task)) return PTR_ERR(task); status = task->tk_status; rpc_put_task(task); return status; } static int rpc_ping_noreply(struct rpc_clnt *clnt) { struct rpc_message msg = { .rpc_proc = &rpcproc_null_noreply, }; struct rpc_task_setup task_setup_data = { .rpc_client = clnt, .rpc_message = &msg, .callback_ops = &rpc_null_ops, .flags = RPC_TASK_SOFT | RPC_TASK_SOFTCONN | RPC_TASK_NULLCREDS, }; struct rpc_task *task; int status; task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); status = task->tk_status; rpc_put_task(task); return status; } struct rpc_cb_add_xprt_calldata { struct rpc_xprt_switch *xps; struct rpc_xprt *xprt; }; static void rpc_cb_add_xprt_done(struct rpc_task *task, void *calldata) { struct rpc_cb_add_xprt_calldata *data = calldata; if (task->tk_status == 0) rpc_xprt_switch_add_xprt(data->xps, data->xprt); } static void rpc_cb_add_xprt_release(void *calldata) { struct rpc_cb_add_xprt_calldata *data = calldata; xprt_put(data->xprt); xprt_switch_put(data->xps); kfree(data); } static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = { .rpc_call_prepare = rpc_null_call_prepare, .rpc_call_done = rpc_cb_add_xprt_done, .rpc_release = rpc_cb_add_xprt_release, }; /** * rpc_clnt_test_and_add_xprt - Test and add a new transport to a rpc_clnt * @clnt: pointer to struct rpc_clnt * @xps: pointer to struct rpc_xprt_switch, * @xprt: pointer struct rpc_xprt * @in_max_connect: pointer to the max_connect value for the passed in xprt transport */ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt_switch *xps, struct rpc_xprt *xprt, void *in_max_connect) { struct rpc_cb_add_xprt_calldata *data; struct rpc_task *task; int max_connect = clnt->cl_max_connect; if (in_max_connect) max_connect = *(int *)in_max_connect; if (xps->xps_nunique_destaddr_xprts + 1 > max_connect) { rcu_read_lock(); pr_warn("SUNRPC: reached max allowed number (%d) did not add " "transport to server: %s\n", max_connect, rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR)); rcu_read_unlock(); return -EINVAL; } data = kmalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; data->xps = xprt_switch_get(xps); data->xprt = xprt_get(xprt); if (rpc_xprt_switch_has_addr(data->xps, (struct sockaddr *)&xprt->addr)) { rpc_cb_add_xprt_release(data); goto success; } task = rpc_call_null_helper(clnt, xprt, NULL, RPC_TASK_ASYNC, &rpc_cb_add_xprt_call_ops, data); if (IS_ERR(task)) return PTR_ERR(task); data->xps->xps_nunique_destaddr_xprts++; rpc_put_task(task); success: return 1; } EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt); static int rpc_clnt_add_xprt_helper(struct rpc_clnt *clnt, struct rpc_xprt *xprt, struct rpc_add_xprt_test *data) { struct rpc_task *task; int status = -EADDRINUSE; /* Test the connection */ task = rpc_call_null_helper(clnt, xprt, NULL, 0, NULL, NULL); if (IS_ERR(task)) return PTR_ERR(task); status = task->tk_status; rpc_put_task(task); if (status < 0) return status; /* rpc_xprt_switch and rpc_xprt are deferrenced by add_xprt_test() */ data->add_xprt_test(clnt, xprt, data->data); return 0; } /** * rpc_clnt_setup_test_and_add_xprt() * * This is an rpc_clnt_add_xprt setup() function which returns 1 so: * 1) caller of the test function must dereference the rpc_xprt_switch * and the rpc_xprt. * 2) test function must call rpc_xprt_switch_add_xprt, usually in * the rpc_call_done routine. * * Upon success (return of 1), the test function adds the new * transport to the rpc_clnt xprt switch * * @clnt: struct rpc_clnt to get the new transport * @xps: the rpc_xprt_switch to hold the new transport * @xprt: the rpc_xprt to test * @data: a struct rpc_add_xprt_test pointer that holds the test function * and test function call data */ int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt_switch *xps, struct rpc_xprt *xprt, void *data) { int status = -EADDRINUSE; xprt = xprt_get(xprt); xprt_switch_get(xps); if (rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr)) goto out_err; status = rpc_clnt_add_xprt_helper(clnt, xprt, data); if (status < 0) goto out_err; status = 1; out_err: xprt_put(xprt); xprt_switch_put(xps); if (status < 0) pr_info("RPC: rpc_clnt_test_xprt failed: %d addr %s not " "added\n", status, xprt->address_strings[RPC_DISPLAY_ADDR]); /* so that rpc_clnt_add_xprt does not call rpc_xprt_switch_add_xprt */ return status; } EXPORT_SYMBOL_GPL(rpc_clnt_setup_test_and_add_xprt); /** * rpc_clnt_add_xprt - Add a new transport to a rpc_clnt * @clnt: pointer to struct rpc_clnt * @xprtargs: pointer to struct xprt_create * @setup: callback to test and/or set up the connection * @data: pointer to setup function data * * Creates a new transport using the parameters set in args and * adds it to clnt. * If ping is set, then test that connectivity succeeds before * adding the new transport. * */ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, struct xprt_create *xprtargs, int (*setup)(struct rpc_clnt *, struct rpc_xprt_switch *, struct rpc_xprt *, void *), void *data) { struct rpc_xprt_switch *xps; struct rpc_xprt *xprt; unsigned long connect_timeout; unsigned long reconnect_timeout; unsigned char resvport, reuseport; int ret = 0, ident; rcu_read_lock(); xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); xprt = xprt_iter_xprt(&clnt->cl_xpi); if (xps == NULL || xprt == NULL) { rcu_read_unlock(); xprt_switch_put(xps); return -EAGAIN; } resvport = xprt->resvport; reuseport = xprt->reuseport; connect_timeout = xprt->connect_timeout; reconnect_timeout = xprt->max_reconnect_timeout; ident = xprt->xprt_class->ident; rcu_read_unlock(); if (!xprtargs->ident) xprtargs->ident = ident; xprtargs->xprtsec = clnt->cl_xprtsec; xprt = xprt_create_transport(xprtargs); if (IS_ERR(xprt)) { ret = PTR_ERR(xprt); goto out_put_switch; } xprt->resvport = resvport; xprt->reuseport = reuseport; if (xprtargs->connect_timeout) connect_timeout = xprtargs->connect_timeout; if (xprtargs->reconnect_timeout) reconnect_timeout = xprtargs->reconnect_timeout; if (xprt->ops->set_connect_timeout != NULL) xprt->ops->set_connect_timeout(xprt, connect_timeout, reconnect_timeout); rpc_xprt_switch_set_roundrobin(xps); if (setup) { ret = setup(clnt, xps, xprt, data); if (ret != 0) goto out_put_xprt; } rpc_xprt_switch_add_xprt(xps, xprt); out_put_xprt: xprt_put(xprt); out_put_switch: xprt_switch_put(xps); return ret; } EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt); static int rpc_xprt_probe_trunked(struct rpc_clnt *clnt, struct rpc_xprt *xprt, struct rpc_add_xprt_test *data) { struct rpc_xprt_switch *xps; struct rpc_xprt *main_xprt; int status = 0; xprt_get(xprt); rcu_read_lock(); main_xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); status = rpc_cmp_addr_port((struct sockaddr *)&xprt->addr, (struct sockaddr *)&main_xprt->addr); rcu_read_unlock(); xprt_put(main_xprt); if (status || !test_bit(XPRT_OFFLINE, &xprt->state)) goto out; status = rpc_clnt_add_xprt_helper(clnt, xprt, data); out: xprt_put(xprt); xprt_switch_put(xps); return status; } /* rpc_clnt_probe_trunked_xprt -- probe offlined transport for session trunking * @clnt rpc_clnt structure * * For each offlined transport found in the rpc_clnt structure call * the function rpc_xprt_probe_trunked() which will determine if this * transport still belongs to the trunking group. */ void rpc_clnt_probe_trunked_xprts(struct rpc_clnt *clnt, struct rpc_add_xprt_test *data) { struct rpc_xprt_iter xpi; int ret; ret = rpc_clnt_xprt_iter_offline_init(clnt, &xpi); if (ret) return; for (;;) { struct rpc_xprt *xprt = xprt_iter_get_next(&xpi); if (!xprt) break; ret = rpc_xprt_probe_trunked(clnt, xprt, data); xprt_put(xprt); if (ret < 0) break; xprt_iter_rewind(&xpi); } xprt_iter_destroy(&xpi); } EXPORT_SYMBOL_GPL(rpc_clnt_probe_trunked_xprts); static int rpc_xprt_offline(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *data) { struct rpc_xprt *main_xprt; struct rpc_xprt_switch *xps; int err = 0; xprt_get(xprt); rcu_read_lock(); main_xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); err = rpc_cmp_addr_port((struct sockaddr *)&xprt->addr, (struct sockaddr *)&main_xprt->addr); rcu_read_unlock(); xprt_put(main_xprt); if (err) goto out; if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { err = -EINTR; goto out; } xprt_set_offline_locked(xprt, xps); xprt_release_write(xprt, NULL); out: xprt_put(xprt); xprt_switch_put(xps); return err; } /* rpc_clnt_manage_trunked_xprts -- offline trunked transports * @clnt rpc_clnt structure * * For each active transport found in the rpc_clnt structure call * the function rpc_xprt_offline() which will identify trunked transports * and will mark them offline. */ void rpc_clnt_manage_trunked_xprts(struct rpc_clnt *clnt) { rpc_clnt_iterate_for_each_xprt(clnt, rpc_xprt_offline, NULL); } EXPORT_SYMBOL_GPL(rpc_clnt_manage_trunked_xprts); struct connect_timeout_data { unsigned long connect_timeout; unsigned long reconnect_timeout; }; static int rpc_xprt_set_connect_timeout(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *data) { struct connect_timeout_data *timeo = data; if (xprt->ops->set_connect_timeout) xprt->ops->set_connect_timeout(xprt, timeo->connect_timeout, timeo->reconnect_timeout); return 0; } void rpc_set_connect_timeout(struct rpc_clnt *clnt, unsigned long connect_timeout, unsigned long reconnect_timeout) { struct connect_timeout_data timeout = { .connect_timeout = connect_timeout, .reconnect_timeout = reconnect_timeout, }; rpc_clnt_iterate_for_each_xprt(clnt, rpc_xprt_set_connect_timeout, &timeout); } EXPORT_SYMBOL_GPL(rpc_set_connect_timeout); void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt) { rcu_read_lock(); xprt_switch_put(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put); void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt) { struct rpc_xprt_switch *xps; rcu_read_lock(); xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); rcu_read_unlock(); xprt_set_online_locked(xprt, xps); } void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) { if (rpc_clnt_xprt_switch_has_addr(clnt, (const struct sockaddr *)&xprt->addr)) { return rpc_clnt_xprt_set_online(clnt, xprt); } rcu_read_lock(); rpc_xprt_switch_add_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch), xprt); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt); void rpc_clnt_xprt_switch_remove_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) { struct rpc_xprt_switch *xps; rcu_read_lock(); xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); rpc_xprt_switch_remove_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch), xprt, 0); xps->xps_nunique_destaddr_xprts--; rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_remove_xprt); bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, const struct sockaddr *sap) { struct rpc_xprt_switch *xps; bool ret; rcu_read_lock(); xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); ret = rpc_xprt_switch_has_addr(xps, sap); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_has_addr); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static void rpc_show_header(void) { printk(KERN_INFO "-pid- flgs status -client- --rqstp- " "-timeout ---ops--\n"); } static void rpc_show_task(const struct rpc_clnt *clnt, const struct rpc_task *task) { const char *rpc_waitq = "none"; if (RPC_IS_QUEUED(task)) rpc_waitq = rpc_qname(task->tk_waitqueue); printk(KERN_INFO "%5u %04x %6d %8p %8p %8ld %8p %sv%u %s a:%ps q:%s\n", task->tk_pid, task->tk_flags, task->tk_status, clnt, task->tk_rqstp, rpc_task_timeout(task), task->tk_ops, clnt->cl_program->name, clnt->cl_vers, rpc_proc_name(task), task->tk_action, rpc_waitq); } void rpc_show_tasks(struct net *net) { struct rpc_clnt *clnt; struct rpc_task *task; int header = 0; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); spin_lock(&sn->rpc_client_lock); list_for_each_entry(clnt, &sn->all_clients, cl_clients) { spin_lock(&clnt->cl_lock); list_for_each_entry(task, &clnt->cl_tasks, tk_task) { if (!header) { rpc_show_header(); header++; } rpc_show_task(clnt, task); } spin_unlock(&clnt->cl_lock); } spin_unlock(&sn->rpc_client_lock); } #endif #if IS_ENABLED(CONFIG_SUNRPC_SWAP) static int rpc_clnt_swap_activate_callback(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *dummy) { return xprt_enable_swap(xprt); } int rpc_clnt_swap_activate(struct rpc_clnt *clnt) { while (clnt != clnt->cl_parent) clnt = clnt->cl_parent; if (atomic_inc_return(&clnt->cl_swapper) == 1) return rpc_clnt_iterate_for_each_xprt(clnt, rpc_clnt_swap_activate_callback, NULL); return 0; } EXPORT_SYMBOL_GPL(rpc_clnt_swap_activate); static int rpc_clnt_swap_deactivate_callback(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *dummy) { xprt_disable_swap(xprt); return 0; } void rpc_clnt_swap_deactivate(struct rpc_clnt *clnt) { while (clnt != clnt->cl_parent) clnt = clnt->cl_parent; if (atomic_dec_if_positive(&clnt->cl_swapper) == 0) rpc_clnt_iterate_for_each_xprt(clnt, rpc_clnt_swap_deactivate_callback, NULL); } EXPORT_SYMBOL_GPL(rpc_clnt_swap_deactivate); #endif /* CONFIG_SUNRPC_SWAP */
3 3 2 1 1 3 3 3 3 3 3 1 5 1 2 2 1 2 3 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 // SPDX-License-Identifier: GPL-2.0-only /* * net/sched/sch_sfb.c Stochastic Fair Blue * * Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr> * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com> * * W. Feng, D. Kandlur, D. Saha, K. Shin. Blue: * A New Class of Active Queue Management Algorithms. * U. Michigan CSE-TR-387-99, April 1999. * * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/random.h> #include <linux/siphash.h> #include <net/ip.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/inet_ecn.h> /* * SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level) * This implementation uses L = 8 and N = 16 * This permits us to split one 32bit hash (provided per packet by rxhash or * external classifier) into 8 subhashes of 4 bits. */ #define SFB_BUCKET_SHIFT 4 #define SFB_NUMBUCKETS (1 << SFB_BUCKET_SHIFT) /* N bins per Level */ #define SFB_BUCKET_MASK (SFB_NUMBUCKETS - 1) #define SFB_LEVELS (32 / SFB_BUCKET_SHIFT) /* L */ /* SFB algo uses a virtual queue, named "bin" */ struct sfb_bucket { u16 qlen; /* length of virtual queue */ u16 p_mark; /* marking probability */ }; /* We use a double buffering right before hash change * (Section 4.4 of SFB reference : moving hash functions) */ struct sfb_bins { siphash_key_t perturbation; /* siphash key */ struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS]; }; struct sfb_sched_data { struct Qdisc *qdisc; struct tcf_proto __rcu *filter_list; struct tcf_block *block; unsigned long rehash_interval; unsigned long warmup_time; /* double buffering warmup time in jiffies */ u32 max; u32 bin_size; /* maximum queue length per bin */ u32 increment; /* d1 */ u32 decrement; /* d2 */ u32 limit; /* HARD maximal queue length */ u32 penalty_rate; u32 penalty_burst; u32 tokens_avail; unsigned long rehash_time; unsigned long token_time; u8 slot; /* current active bins (0 or 1) */ bool double_buffering; struct sfb_bins bins[2]; struct { u32 earlydrop; u32 penaltydrop; u32 bucketdrop; u32 queuedrop; u32 childdrop; /* drops in child qdisc */ u32 marked; /* ECN mark */ } stats; }; /* * Each queued skb might be hashed on one or two bins * We store in skb_cb the two hash values. * (A zero value means double buffering was not used) */ struct sfb_skb_cb { u32 hashes[2]; }; static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb) { qdisc_cb_private_validate(skb, sizeof(struct sfb_skb_cb)); return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data; } /* * If using 'internal' SFB flow classifier, hash comes from skb rxhash * If using external classifier, hash comes from the classid. */ static u32 sfb_hash(const struct sk_buff *skb, u32 slot) { return sfb_skb_cb(skb)->hashes[slot]; } /* Probabilities are coded as Q0.16 fixed-point values, * with 0xFFFF representing 65535/65536 (almost 1.0) * Addition and subtraction are saturating in [0, 65535] */ static u32 prob_plus(u32 p1, u32 p2) { u32 res = p1 + p2; return min_t(u32, res, SFB_MAX_PROB); } static u32 prob_minus(u32 p1, u32 p2) { return p1 > p2 ? p1 - p2 : 0; } static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q) { int i; struct sfb_bucket *b = &q->bins[slot].bins[0][0]; for (i = 0; i < SFB_LEVELS; i++) { u32 hash = sfbhash & SFB_BUCKET_MASK; sfbhash >>= SFB_BUCKET_SHIFT; if (b[hash].qlen < 0xFFFF) b[hash].qlen++; b += SFB_NUMBUCKETS; /* next level */ } } static void increment_qlen(const struct sfb_skb_cb *cb, struct sfb_sched_data *q) { u32 sfbhash; sfbhash = cb->hashes[0]; if (sfbhash) increment_one_qlen(sfbhash, 0, q); sfbhash = cb->hashes[1]; if (sfbhash) increment_one_qlen(sfbhash, 1, q); } static void decrement_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q) { int i; struct sfb_bucket *b = &q->bins[slot].bins[0][0]; for (i = 0; i < SFB_LEVELS; i++) { u32 hash = sfbhash & SFB_BUCKET_MASK; sfbhash >>= SFB_BUCKET_SHIFT; if (b[hash].qlen > 0) b[hash].qlen--; b += SFB_NUMBUCKETS; /* next level */ } } static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) { u32 sfbhash; sfbhash = sfb_hash(skb, 0); if (sfbhash) decrement_one_qlen(sfbhash, 0, q); sfbhash = sfb_hash(skb, 1); if (sfbhash) decrement_one_qlen(sfbhash, 1, q); } static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q) { b->p_mark = prob_minus(b->p_mark, q->decrement); } static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q) { b->p_mark = prob_plus(b->p_mark, q->increment); } static void sfb_zero_all_buckets(struct sfb_sched_data *q) { memset(&q->bins, 0, sizeof(q->bins)); } /* * compute max qlen, max p_mark, and avg p_mark */ static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_data *q) { int i; u32 qlen = 0, prob = 0, totalpm = 0; const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0]; for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) { if (qlen < b->qlen) qlen = b->qlen; totalpm += b->p_mark; if (prob < b->p_mark) prob = b->p_mark; b++; } *prob_r = prob; *avgpm_r = totalpm / (SFB_LEVELS * SFB_NUMBUCKETS); return qlen; } static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q) { get_random_bytes(&q->bins[slot].perturbation, sizeof(q->bins[slot].perturbation)); } static void sfb_swap_slot(struct sfb_sched_data *q) { sfb_init_perturbation(q->slot, q); q->slot ^= 1; q->double_buffering = false; } /* Non elastic flows are allowed to use part of the bandwidth, expressed * in "penalty_rate" packets per second, with "penalty_burst" burst */ static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q) { if (q->penalty_rate == 0 || q->penalty_burst == 0) return true; if (q->tokens_avail < 1) { unsigned long age = min(10UL * HZ, jiffies - q->token_time); q->tokens_avail = (age * q->penalty_rate) / HZ; if (q->tokens_avail > q->penalty_burst) q->tokens_avail = q->penalty_burst; q->token_time = jiffies; if (q->tokens_avail < 1) return true; } q->tokens_avail--; return false; } static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, int *qerr, u32 *salt) { struct tcf_result res; int result; result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return false; } #endif *salt = TC_H_MIN(res.classid); return true; } return false; } static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct sfb_sched_data *q = qdisc_priv(sch); unsigned int len = qdisc_pkt_len(skb); struct Qdisc *child = q->qdisc; struct tcf_proto *fl; struct sfb_skb_cb cb; int i; u32 p_min = ~0; u32 minqlen = ~0; u32 r, sfbhash; u32 slot = q->slot; int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (unlikely(sch->q.qlen >= q->limit)) { qdisc_qstats_overlimit(sch); q->stats.queuedrop++; goto drop; } if (q->rehash_interval > 0) { unsigned long limit = q->rehash_time + q->rehash_interval; if (unlikely(time_after(jiffies, limit))) { sfb_swap_slot(q); q->rehash_time = jiffies; } else if (unlikely(!q->double_buffering && q->warmup_time > 0 && time_after(jiffies, limit - q->warmup_time))) { q->double_buffering = true; } } fl = rcu_dereference_bh(q->filter_list); if (fl) { u32 salt; /* If using external classifiers, get result and record it. */ if (!sfb_classify(skb, fl, &ret, &salt)) goto other_drop; sfbhash = siphash_1u32(salt, &q->bins[slot].perturbation); } else { sfbhash = skb_get_hash_perturb(skb, &q->bins[slot].perturbation); } if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; for (i = 0; i < SFB_LEVELS; i++) { u32 hash = sfbhash & SFB_BUCKET_MASK; struct sfb_bucket *b = &q->bins[slot].bins[i][hash]; sfbhash >>= SFB_BUCKET_SHIFT; if (b->qlen == 0) decrement_prob(b, q); else if (b->qlen >= q->bin_size) increment_prob(b, q); if (minqlen > b->qlen) minqlen = b->qlen; if (p_min > b->p_mark) p_min = b->p_mark; } slot ^= 1; sfb_skb_cb(skb)->hashes[slot] = 0; if (unlikely(minqlen >= q->max)) { qdisc_qstats_overlimit(sch); q->stats.bucketdrop++; goto drop; } if (unlikely(p_min >= SFB_MAX_PROB)) { /* Inelastic flow */ if (q->double_buffering) { sfbhash = skb_get_hash_perturb(skb, &q->bins[slot].perturbation); if (!sfbhash) sfbhash = 1; sfb_skb_cb(skb)->hashes[slot] = sfbhash; for (i = 0; i < SFB_LEVELS; i++) { u32 hash = sfbhash & SFB_BUCKET_MASK; struct sfb_bucket *b = &q->bins[slot].bins[i][hash]; sfbhash >>= SFB_BUCKET_SHIFT; if (b->qlen == 0) decrement_prob(b, q); else if (b->qlen >= q->bin_size) increment_prob(b, q); } } if (sfb_rate_limit(skb, q)) { qdisc_qstats_overlimit(sch); q->stats.penaltydrop++; goto drop; } goto enqueue; } r = get_random_u16() & SFB_MAX_PROB; if (unlikely(r < p_min)) { if (unlikely(p_min > SFB_MAX_PROB / 2)) { /* If we're marking that many packets, then either * this flow is unresponsive, or we're badly congested. * In either case, we want to start dropping packets. */ if (r < (p_min - SFB_MAX_PROB / 2) * 2) { q->stats.earlydrop++; goto drop; } } if (INET_ECN_set_ce(skb)) { q->stats.marked++; } else { q->stats.earlydrop++; goto drop; } } enqueue: memcpy(&cb, sfb_skb_cb(skb), sizeof(cb)); ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { sch->qstats.backlog += len; sch->q.qlen++; increment_qlen(&cb, q); } else if (net_xmit_drop_count(ret)) { q->stats.childdrop++; qdisc_qstats_drop(sch); } return ret; drop: qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; other_drop: if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); kfree_skb(skb); return ret; } static struct sk_buff *sfb_dequeue(struct Qdisc *sch) { struct sfb_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; struct sk_buff *skb; skb = child->dequeue(q->qdisc); if (skb) { qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; decrement_qlen(skb, q); } return skb; } static struct sk_buff *sfb_peek(struct Qdisc *sch) { struct sfb_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; return child->ops->peek(child); } /* No sfb_drop -- impossible since the child doesn't return the dropped skb. */ static void sfb_reset(struct Qdisc *sch) { struct sfb_sched_data *q = qdisc_priv(sch); if (likely(q->qdisc)) qdisc_reset(q->qdisc); q->slot = 0; q->double_buffering = false; sfb_zero_all_buckets(q); sfb_init_perturbation(0, q); } static void sfb_destroy(struct Qdisc *sch) { struct sfb_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); qdisc_put(q->qdisc); } static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = { [TCA_SFB_PARMS] = { .len = sizeof(struct tc_sfb_qopt) }, }; static const struct tc_sfb_qopt sfb_default_ops = { .rehash_interval = 600 * MSEC_PER_SEC, .warmup_time = 60 * MSEC_PER_SEC, .limit = 0, .max = 25, .bin_size = 20, .increment = (SFB_MAX_PROB + 500) / 1000, /* 0.1 % */ .decrement = (SFB_MAX_PROB + 3000) / 6000, .penalty_rate = 10, .penalty_burst = 20, }; static int sfb_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct sfb_sched_data *q = qdisc_priv(sch); struct Qdisc *child, *old; struct nlattr *tb[TCA_SFB_MAX + 1]; const struct tc_sfb_qopt *ctl = &sfb_default_ops; u32 limit; int err; if (opt) { err = nla_parse_nested_deprecated(tb, TCA_SFB_MAX, opt, sfb_policy, NULL); if (err < 0) return -EINVAL; if (tb[TCA_SFB_PARMS] == NULL) return -EINVAL; ctl = nla_data(tb[TCA_SFB_PARMS]); } limit = ctl->limit; if (limit == 0) limit = qdisc_dev(sch)->tx_queue_len; child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit, extack); if (IS_ERR(child)) return PTR_ERR(child); if (child != &noop_qdisc) qdisc_hash_add(child, true); sch_tree_lock(sch); qdisc_purge_queue(q->qdisc); old = q->qdisc; q->qdisc = child; q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval); q->warmup_time = msecs_to_jiffies(ctl->warmup_time); q->rehash_time = jiffies; q->limit = limit; q->increment = ctl->increment; q->decrement = ctl->decrement; q->max = ctl->max; q->bin_size = ctl->bin_size; q->penalty_rate = ctl->penalty_rate; q->penalty_burst = ctl->penalty_burst; q->tokens_avail = ctl->penalty_burst; q->token_time = jiffies; q->slot = 0; q->double_buffering = false; sfb_zero_all_buckets(q); sfb_init_perturbation(0, q); sfb_init_perturbation(1, q); sch_tree_unlock(sch); qdisc_put(old); return 0; } static int sfb_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct sfb_sched_data *q = qdisc_priv(sch); int err; err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; q->qdisc = &noop_qdisc; return sfb_change(sch, opt, extack); } static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb) { struct sfb_sched_data *q = qdisc_priv(sch); struct nlattr *opts; struct tc_sfb_qopt opt = { .rehash_interval = jiffies_to_msecs(q->rehash_interval), .warmup_time = jiffies_to_msecs(q->warmup_time), .limit = q->limit, .max = q->max, .bin_size = q->bin_size, .increment = q->increment, .decrement = q->decrement, .penalty_rate = q->penalty_rate, .penalty_burst = q->penalty_burst, }; sch->qstats.backlog = q->qdisc->qstats.backlog; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; if (nla_put(skb, TCA_SFB_PARMS, sizeof(opt), &opt)) goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: nla_nest_cancel(skb, opts); return -EMSGSIZE; } static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct sfb_sched_data *q = qdisc_priv(sch); struct tc_sfb_xstats st = { .earlydrop = q->stats.earlydrop, .penaltydrop = q->stats.penaltydrop, .bucketdrop = q->stats.bucketdrop, .queuedrop = q->stats.queuedrop, .childdrop = q->stats.childdrop, .marked = q->stats.marked, }; st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q); return gnet_stats_copy_app(d, &st, sizeof(st)); } static int sfb_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { return -ENOSYS; } static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct sfb_sched_data *q = qdisc_priv(sch); if (new == NULL) new = &noop_qdisc; *old = qdisc_replace(sch, new, &q->qdisc); return 0; } static struct Qdisc *sfb_leaf(struct Qdisc *sch, unsigned long arg) { struct sfb_sched_data *q = qdisc_priv(sch); return q->qdisc; } static unsigned long sfb_find(struct Qdisc *sch, u32 classid) { return 1; } static void sfb_unbind(struct Qdisc *sch, unsigned long arg) { } static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, unsigned long *arg, struct netlink_ext_ack *extack) { return -ENOSYS; } static int sfb_delete(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { return -ENOSYS; } static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { tc_qdisc_stats_dump(sch, 1, walker); } } static struct tcf_block *sfb_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct sfb_sched_data *q = qdisc_priv(sch); if (cl) return NULL; return q->block; } static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { return 0; } static const struct Qdisc_class_ops sfb_class_ops = { .graft = sfb_graft, .leaf = sfb_leaf, .find = sfb_find, .change = sfb_change_class, .delete = sfb_delete, .walk = sfb_walk, .tcf_block = sfb_tcf_block, .bind_tcf = sfb_bind, .unbind_tcf = sfb_unbind, .dump = sfb_dump_class, }; static struct Qdisc_ops sfb_qdisc_ops __read_mostly = { .id = "sfb", .priv_size = sizeof(struct sfb_sched_data), .cl_ops = &sfb_class_ops, .enqueue = sfb_enqueue, .dequeue = sfb_dequeue, .peek = sfb_peek, .init = sfb_init, .reset = sfb_reset, .destroy = sfb_destroy, .change = sfb_change, .dump = sfb_dump, .dump_stats = sfb_dump_stats, .owner = THIS_MODULE, }; static int __init sfb_module_init(void) { return register_qdisc(&sfb_qdisc_ops); } static void __exit sfb_module_exit(void) { unregister_qdisc(&sfb_qdisc_ops); } module_init(sfb_module_init) module_exit(sfb_module_exit) MODULE_DESCRIPTION("Stochastic Fair Blue queue discipline"); MODULE_AUTHOR("Juliusz Chroboczek"); MODULE_AUTHOR("Eric Dumazet"); MODULE_LICENSE("GPL");
30 95 8 172 172 48 11 42 119 111 15 3 1 2 68 13 13 55 19 5 2 1 11 18 53 53 37 16 3 2 48 13 1 12 4 17 2 32 6 59 13 30 17 6 6 1 16 10 21 6 25 4 1 3 93 51 42 7 6 8 16 3 2 4 6 7 4 4 4 5 3 1 2 2 6 4 14 3 1 1 104 1 1 5 1 3 4 2 14 75 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * * Modified by Fred N. van Kempen, 01/29/93, to add line disciplines * which can be dynamically activated and de-activated by the line * discipline handling modules (like SLIP). */ #include <linux/bits.h> #include <linux/types.h> #include <linux/termios.h> #include <linux/errno.h> #include <linux/sched/signal.h> #include <linux/kernel.h> #include <linux/major.h> #include <linux/tty.h> #include <linux/fcntl.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/mutex.h> #include <linux/compat.h> #include <linux/termios_internal.h> #include "tty.h" #include <asm/io.h> #include <linux/uaccess.h> #undef DEBUG /* * Internal flag options for termios setting behavior */ #define TERMIOS_FLUSH BIT(0) #define TERMIOS_WAIT BIT(1) #define TERMIOS_TERMIO BIT(2) #define TERMIOS_OLD BIT(3) /** * tty_chars_in_buffer - characters pending * @tty: terminal * * Returns: the number of bytes of data in the device private output queue. If * no private method is supplied there is assumed to be no queue on the device. */ unsigned int tty_chars_in_buffer(struct tty_struct *tty) { if (tty->ops->chars_in_buffer) return tty->ops->chars_in_buffer(tty); return 0; } EXPORT_SYMBOL(tty_chars_in_buffer); /** * tty_write_room - write queue space * @tty: terminal * * Returns: the number of bytes that can be queued to this device at the present * time. The result should be treated as a guarantee and the driver cannot * offer a value it later shrinks by more than the number of bytes written. If * no method is provided, 2K is always returned and data may be lost as there * will be no flow control. */ unsigned int tty_write_room(struct tty_struct *tty) { if (tty->ops->write_room) return tty->ops->write_room(tty); return 2048; } EXPORT_SYMBOL(tty_write_room); /** * tty_driver_flush_buffer - discard internal buffer * @tty: terminal * * Discard the internal output buffer for this device. If no method is provided, * then either the buffer cannot be hardware flushed or there is no buffer * driver side. */ void tty_driver_flush_buffer(struct tty_struct *tty) { if (tty->ops->flush_buffer) tty->ops->flush_buffer(tty); } EXPORT_SYMBOL(tty_driver_flush_buffer); /** * tty_unthrottle - flow control * @tty: terminal * * Indicate that a @tty may continue transmitting data down the stack. Takes * the &tty_struct->termios_rwsem to protect against parallel * throttle/unthrottle and also to ensure the driver can consistently reference * its own termios data at this point when implementing software flow control. * * Drivers should however remember that the stack can issue a throttle, then * change flow control method, then unthrottle. */ void tty_unthrottle(struct tty_struct *tty) { down_write(&tty->termios_rwsem); if (test_and_clear_bit(TTY_THROTTLED, &tty->flags) && tty->ops->unthrottle) tty->ops->unthrottle(tty); tty->flow_change = TTY_FLOW_NO_CHANGE; up_write(&tty->termios_rwsem); } EXPORT_SYMBOL(tty_unthrottle); /** * tty_throttle_safe - flow control * @tty: terminal * * Indicate that a @tty should stop transmitting data down the stack. * tty_throttle_safe() will only attempt throttle if @tty->flow_change is * %TTY_THROTTLE_SAFE. Prevents an accidental throttle due to race conditions * when throttling is conditional on factors evaluated prior to throttling. * * Returns: %true if @tty is throttled (or was already throttled) */ bool tty_throttle_safe(struct tty_struct *tty) { bool ret = true; mutex_lock(&tty->throttle_mutex); if (!tty_throttled(tty)) { if (tty->flow_change != TTY_THROTTLE_SAFE) ret = false; else { set_bit(TTY_THROTTLED, &tty->flags); if (tty->ops->throttle) tty->ops->throttle(tty); } } mutex_unlock(&tty->throttle_mutex); return ret; } /** * tty_unthrottle_safe - flow control * @tty: terminal * * Similar to tty_unthrottle() but will only attempt unthrottle if * @tty->flow_change is %TTY_UNTHROTTLE_SAFE. Prevents an accidental unthrottle * due to race conditions when unthrottling is conditional on factors evaluated * prior to unthrottling. * * Returns: %true if @tty is unthrottled (or was already unthrottled) */ bool tty_unthrottle_safe(struct tty_struct *tty) { bool ret = true; mutex_lock(&tty->throttle_mutex); if (tty_throttled(tty)) { if (tty->flow_change != TTY_UNTHROTTLE_SAFE) ret = false; else { clear_bit(TTY_THROTTLED, &tty->flags); if (tty->ops->unthrottle) tty->ops->unthrottle(tty); } } mutex_unlock(&tty->throttle_mutex); return ret; } /** * tty_wait_until_sent - wait for I/O to finish * @tty: tty we are waiting for * @timeout: how long we will wait * * Wait for characters pending in a tty driver to hit the wire, or for a * timeout to occur (eg due to flow control). * * Locking: none */ void tty_wait_until_sent(struct tty_struct *tty, long timeout) { if (!timeout) timeout = MAX_SCHEDULE_TIMEOUT; timeout = wait_event_interruptible_timeout(tty->write_wait, !tty_chars_in_buffer(tty), timeout); if (timeout <= 0) return; if (timeout == MAX_SCHEDULE_TIMEOUT) timeout = 0; if (tty->ops->wait_until_sent) tty->ops->wait_until_sent(tty, timeout); } EXPORT_SYMBOL(tty_wait_until_sent); /* * Termios Helper Methods */ static void unset_locked_termios(struct tty_struct *tty, const struct ktermios *old) { struct ktermios *termios = &tty->termios; struct ktermios *locked = &tty->termios_locked; int i; #define NOSET_MASK(x, y, z) (x = ((x) & ~(z)) | ((y) & (z))) NOSET_MASK(termios->c_iflag, old->c_iflag, locked->c_iflag); NOSET_MASK(termios->c_oflag, old->c_oflag, locked->c_oflag); NOSET_MASK(termios->c_cflag, old->c_cflag, locked->c_cflag); NOSET_MASK(termios->c_lflag, old->c_lflag, locked->c_lflag); termios->c_line = locked->c_line ? old->c_line : termios->c_line; for (i = 0; i < NCCS; i++) termios->c_cc[i] = locked->c_cc[i] ? old->c_cc[i] : termios->c_cc[i]; /* FIXME: What should we do for i/ospeed */ } /** * tty_termios_copy_hw - copy hardware settings * @new: new termios * @old: old termios * * Propagate the hardware specific terminal setting bits from the @old termios * structure to the @new one. This is used in cases where the hardware does not * support reconfiguration or as a helper in some cases where only minimal * reconfiguration is supported. */ void tty_termios_copy_hw(struct ktermios *new, const struct ktermios *old) { /* The bits a dumb device handles in software. Smart devices need to always provide a set_termios method */ new->c_cflag &= HUPCL | CREAD | CLOCAL; new->c_cflag |= old->c_cflag & ~(HUPCL | CREAD | CLOCAL); new->c_ispeed = old->c_ispeed; new->c_ospeed = old->c_ospeed; } EXPORT_SYMBOL(tty_termios_copy_hw); /** * tty_termios_hw_change - check for setting change * @a: termios * @b: termios to compare * * Check if any of the bits that affect a dumb device have changed between the * two termios structures, or a speed change is needed. * * Returns: %true if change is needed */ bool tty_termios_hw_change(const struct ktermios *a, const struct ktermios *b) { if (a->c_ispeed != b->c_ispeed || a->c_ospeed != b->c_ospeed) return true; if ((a->c_cflag ^ b->c_cflag) & ~(HUPCL | CREAD | CLOCAL)) return true; return false; } EXPORT_SYMBOL(tty_termios_hw_change); /** * tty_get_char_size - get size of a character * @cflag: termios cflag value * * Returns: size (in bits) of a character depending on @cflag's %CSIZE setting */ unsigned char tty_get_char_size(unsigned int cflag) { switch (cflag & CSIZE) { case CS5: return 5; case CS6: return 6; case CS7: return 7; case CS8: default: return 8; } } EXPORT_SYMBOL_GPL(tty_get_char_size); /** * tty_get_frame_size - get size of a frame * @cflag: termios cflag value * * Get the size (in bits) of a frame depending on @cflag's %CSIZE, %CSTOPB, and * %PARENB setting. The result is a sum of character size, start and stop bits * -- one bit each -- second stop bit (if set), and parity bit (if set). * * Returns: size (in bits) of a frame depending on @cflag's setting. */ unsigned char tty_get_frame_size(unsigned int cflag) { unsigned char bits = 2 + tty_get_char_size(cflag); if (cflag & CSTOPB) bits++; if (cflag & PARENB) bits++; if (cflag & ADDRB) bits++; return bits; } EXPORT_SYMBOL_GPL(tty_get_frame_size); /** * tty_set_termios - update termios values * @tty: tty to update * @new_termios: desired new value * * Perform updates to the termios values set on this @tty. A master pty's * termios should never be set. * * Locking: &tty_struct->termios_rwsem */ int tty_set_termios(struct tty_struct *tty, struct ktermios *new_termios) { struct ktermios old_termios; struct tty_ldisc *ld; WARN_ON(tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER); /* * Perform the actual termios internal changes under lock. */ /* FIXME: we need to decide on some locking/ordering semantics for the set_termios notification eventually */ down_write(&tty->termios_rwsem); old_termios = tty->termios; tty->termios = *new_termios; unset_locked_termios(tty, &old_termios); /* Reset any ADDRB changes, ADDRB is changed through ->rs485_config() */ tty->termios.c_cflag ^= (tty->termios.c_cflag ^ old_termios.c_cflag) & ADDRB; if (tty->ops->set_termios) tty->ops->set_termios(tty, &old_termios); else tty_termios_copy_hw(&tty->termios, &old_termios); ld = tty_ldisc_ref(tty); if (ld != NULL) { if (ld->ops->set_termios) ld->ops->set_termios(tty, &old_termios); tty_ldisc_deref(ld); } up_write(&tty->termios_rwsem); return 0; } EXPORT_SYMBOL_GPL(tty_set_termios); /* * Translate a "termio" structure into a "termios". Ugh. */ __weak int user_termio_to_kernel_termios(struct ktermios *termios, struct termio __user *termio) { struct termio v; if (copy_from_user(&v, termio, sizeof(struct termio))) return -EFAULT; termios->c_iflag = (0xffff0000 & termios->c_iflag) | v.c_iflag; termios->c_oflag = (0xffff0000 & termios->c_oflag) | v.c_oflag; termios->c_cflag = (0xffff0000 & termios->c_cflag) | v.c_cflag; termios->c_lflag = (0xffff0000 & termios->c_lflag) | v.c_lflag; termios->c_line = (0xffff0000 & termios->c_lflag) | v.c_line; memcpy(termios->c_cc, v.c_cc, NCC); return 0; } /* * Translate a "termios" structure into a "termio". Ugh. */ __weak int kernel_termios_to_user_termio(struct termio __user *termio, struct ktermios *termios) { struct termio v; memset(&v, 0, sizeof(struct termio)); v.c_iflag = termios->c_iflag; v.c_oflag = termios->c_oflag; v.c_cflag = termios->c_cflag; v.c_lflag = termios->c_lflag; v.c_line = termios->c_line; memcpy(v.c_cc, termios->c_cc, NCC); return copy_to_user(termio, &v, sizeof(struct termio)); } #ifdef TCGETS2 __weak int user_termios_to_kernel_termios(struct ktermios *k, struct termios2 __user *u) { return copy_from_user(k, u, sizeof(struct termios2)); } __weak int kernel_termios_to_user_termios(struct termios2 __user *u, struct ktermios *k) { return copy_to_user(u, k, sizeof(struct termios2)); } __weak int user_termios_to_kernel_termios_1(struct ktermios *k, struct termios __user *u) { return copy_from_user(k, u, sizeof(struct termios)); } __weak int kernel_termios_to_user_termios_1(struct termios __user *u, struct ktermios *k) { return copy_to_user(u, k, sizeof(struct termios)); } #else __weak int user_termios_to_kernel_termios(struct ktermios *k, struct termios __user *u) { return copy_from_user(k, u, sizeof(struct termios)); } __weak int kernel_termios_to_user_termios(struct termios __user *u, struct ktermios *k) { return copy_to_user(u, k, sizeof(struct termios)); } #endif /* TCGETS2 */ /** * set_termios - set termios values for a tty * @tty: terminal device * @arg: user data * @opt: option information * * Helper function to prepare termios data and run necessary other functions * before using tty_set_termios() to do the actual changes. * * Locking: called functions take &tty_struct->ldisc_sem and * &tty_struct->termios_rwsem locks * * Returns: 0 on success, an error otherwise */ static int set_termios(struct tty_struct *tty, void __user *arg, int opt) { struct ktermios tmp_termios; struct tty_ldisc *ld; int retval = tty_check_change(tty); if (retval) return retval; down_read(&tty->termios_rwsem); tmp_termios = tty->termios; up_read(&tty->termios_rwsem); if (opt & TERMIOS_TERMIO) { if (user_termio_to_kernel_termios(&tmp_termios, (struct termio __user *)arg)) return -EFAULT; #ifdef TCGETS2 } else if (opt & TERMIOS_OLD) { if (user_termios_to_kernel_termios_1(&tmp_termios, (struct termios __user *)arg)) return -EFAULT; } else { if (user_termios_to_kernel_termios(&tmp_termios, (struct termios2 __user *)arg)) return -EFAULT; } #else } else if (user_termios_to_kernel_termios(&tmp_termios, (struct termios __user *)arg)) return -EFAULT; #endif /* If old style Bfoo values are used then load c_ispeed/c_ospeed * with the real speed so its unconditionally usable */ tmp_termios.c_ispeed = tty_termios_input_baud_rate(&tmp_termios); tmp_termios.c_ospeed = tty_termios_baud_rate(&tmp_termios); if (opt & (TERMIOS_FLUSH|TERMIOS_WAIT)) { retry_write_wait: retval = wait_event_interruptible(tty->write_wait, !tty_chars_in_buffer(tty)); if (retval < 0) return retval; if (tty_write_lock(tty, false) < 0) goto retry_write_wait; /* Racing writer? */ if (tty_chars_in_buffer(tty)) { tty_write_unlock(tty); goto retry_write_wait; } ld = tty_ldisc_ref(tty); if (ld != NULL) { if ((opt & TERMIOS_FLUSH) && ld->ops->flush_buffer) ld->ops->flush_buffer(tty); tty_ldisc_deref(ld); } if ((opt & TERMIOS_WAIT) && tty->ops->wait_until_sent) { tty->ops->wait_until_sent(tty, 0); if (signal_pending(current)) { tty_write_unlock(tty); return -ERESTARTSYS; } } tty_set_termios(tty, &tmp_termios); tty_write_unlock(tty); } else { tty_set_termios(tty, &tmp_termios); } /* FIXME: Arguably if tmp_termios == tty->termios AND the actual requested termios was not tmp_termios then we may want to return an error as no user requested change has succeeded */ return 0; } static void copy_termios(struct tty_struct *tty, struct ktermios *kterm) { down_read(&tty->termios_rwsem); *kterm = tty->termios; up_read(&tty->termios_rwsem); } static void copy_termios_locked(struct tty_struct *tty, struct ktermios *kterm) { down_read(&tty->termios_rwsem); *kterm = tty->termios_locked; up_read(&tty->termios_rwsem); } static int get_termio(struct tty_struct *tty, struct termio __user *termio) { struct ktermios kterm; copy_termios(tty, &kterm); if (kernel_termios_to_user_termio(termio, &kterm)) return -EFAULT; return 0; } #ifdef TIOCGETP /* * These are deprecated, but there is limited support.. * * The "sg_flags" translation is a joke.. */ static int get_sgflags(struct tty_struct *tty) { int flags = 0; if (!L_ICANON(tty)) { if (L_ISIG(tty)) flags |= 0x02; /* cbreak */ else flags |= 0x20; /* raw */ } if (L_ECHO(tty)) flags |= 0x08; /* echo */ if (O_OPOST(tty)) if (O_ONLCR(tty)) flags |= 0x10; /* crmod */ return flags; } static int get_sgttyb(struct tty_struct *tty, struct sgttyb __user *sgttyb) { struct sgttyb tmp; down_read(&tty->termios_rwsem); tmp.sg_ispeed = tty->termios.c_ispeed; tmp.sg_ospeed = tty->termios.c_ospeed; tmp.sg_erase = tty->termios.c_cc[VERASE]; tmp.sg_kill = tty->termios.c_cc[VKILL]; tmp.sg_flags = get_sgflags(tty); up_read(&tty->termios_rwsem); return copy_to_user(sgttyb, &tmp, sizeof(tmp)) ? -EFAULT : 0; } static void set_sgflags(struct ktermios *termios, int flags) { termios->c_iflag = ICRNL | IXON; termios->c_oflag = 0; termios->c_lflag = ISIG | ICANON; if (flags & 0x02) { /* cbreak */ termios->c_iflag = 0; termios->c_lflag &= ~ICANON; } if (flags & 0x08) { /* echo */ termios->c_lflag |= ECHO | ECHOE | ECHOK | ECHOCTL | ECHOKE | IEXTEN; } if (flags & 0x10) { /* crmod */ termios->c_oflag |= OPOST | ONLCR; } if (flags & 0x20) { /* raw */ termios->c_iflag = 0; termios->c_lflag &= ~(ISIG | ICANON); } if (!(termios->c_lflag & ICANON)) { termios->c_cc[VMIN] = 1; termios->c_cc[VTIME] = 0; } } /** * set_sgttyb - set legacy terminal values * @tty: tty structure * @sgttyb: pointer to old style terminal structure * * Updates a terminal from the legacy BSD style terminal information structure. * * Locking: &tty_struct->termios_rwsem * * Returns: 0 on success, an error otherwise */ static int set_sgttyb(struct tty_struct *tty, struct sgttyb __user *sgttyb) { int retval; struct sgttyb tmp; struct ktermios termios; retval = tty_check_change(tty); if (retval) return retval; if (copy_from_user(&tmp, sgttyb, sizeof(tmp))) return -EFAULT; down_write(&tty->termios_rwsem); termios = tty->termios; termios.c_cc[VERASE] = tmp.sg_erase; termios.c_cc[VKILL] = tmp.sg_kill; set_sgflags(&termios, tmp.sg_flags); /* Try and encode into Bfoo format */ tty_termios_encode_baud_rate(&termios, termios.c_ispeed, termios.c_ospeed); up_write(&tty->termios_rwsem); tty_set_termios(tty, &termios); return 0; } #endif #ifdef TIOCGETC static int get_tchars(struct tty_struct *tty, struct tchars __user *tchars) { struct tchars tmp; down_read(&tty->termios_rwsem); tmp.t_intrc = tty->termios.c_cc[VINTR]; tmp.t_quitc = tty->termios.c_cc[VQUIT]; tmp.t_startc = tty->termios.c_cc[VSTART]; tmp.t_stopc = tty->termios.c_cc[VSTOP]; tmp.t_eofc = tty->termios.c_cc[VEOF]; tmp.t_brkc = tty->termios.c_cc[VEOL2]; /* what is brkc anyway? */ up_read(&tty->termios_rwsem); return copy_to_user(tchars, &tmp, sizeof(tmp)) ? -EFAULT : 0; } static int set_tchars(struct tty_struct *tty, struct tchars __user *tchars) { struct tchars tmp; if (copy_from_user(&tmp, tchars, sizeof(tmp))) return -EFAULT; down_write(&tty->termios_rwsem); tty->termios.c_cc[VINTR] = tmp.t_intrc; tty->termios.c_cc[VQUIT] = tmp.t_quitc; tty->termios.c_cc[VSTART] = tmp.t_startc; tty->termios.c_cc[VSTOP] = tmp.t_stopc; tty->termios.c_cc[VEOF] = tmp.t_eofc; tty->termios.c_cc[VEOL2] = tmp.t_brkc; /* what is brkc anyway? */ up_write(&tty->termios_rwsem); return 0; } #endif #ifdef TIOCGLTC static int get_ltchars(struct tty_struct *tty, struct ltchars __user *ltchars) { struct ltchars tmp; down_read(&tty->termios_rwsem); tmp.t_suspc = tty->termios.c_cc[VSUSP]; /* what is dsuspc anyway? */ tmp.t_dsuspc = tty->termios.c_cc[VSUSP]; tmp.t_rprntc = tty->termios.c_cc[VREPRINT]; /* what is flushc anyway? */ tmp.t_flushc = tty->termios.c_cc[VEOL2]; tmp.t_werasc = tty->termios.c_cc[VWERASE]; tmp.t_lnextc = tty->termios.c_cc[VLNEXT]; up_read(&tty->termios_rwsem); return copy_to_user(ltchars, &tmp, sizeof(tmp)) ? -EFAULT : 0; } static int set_ltchars(struct tty_struct *tty, struct ltchars __user *ltchars) { struct ltchars tmp; if (copy_from_user(&tmp, ltchars, sizeof(tmp))) return -EFAULT; down_write(&tty->termios_rwsem); tty->termios.c_cc[VSUSP] = tmp.t_suspc; /* what is dsuspc anyway? */ tty->termios.c_cc[VEOL2] = tmp.t_dsuspc; tty->termios.c_cc[VREPRINT] = tmp.t_rprntc; /* what is flushc anyway? */ tty->termios.c_cc[VEOL2] = tmp.t_flushc; tty->termios.c_cc[VWERASE] = tmp.t_werasc; tty->termios.c_cc[VLNEXT] = tmp.t_lnextc; up_write(&tty->termios_rwsem); return 0; } #endif /** * tty_change_softcar - carrier change ioctl helper * @tty: tty to update * @enable: enable/disable %CLOCAL * * Perform a change to the %CLOCAL state and call into the driver layer to make * it visible. * * Locking: &tty_struct->termios_rwsem. * * Returns: 0 on success, an error otherwise */ static int tty_change_softcar(struct tty_struct *tty, bool enable) { int ret = 0; struct ktermios old; tcflag_t bit = enable ? CLOCAL : 0; down_write(&tty->termios_rwsem); old = tty->termios; tty->termios.c_cflag &= ~CLOCAL; tty->termios.c_cflag |= bit; if (tty->ops->set_termios) tty->ops->set_termios(tty, &old); if (C_CLOCAL(tty) != bit) ret = -EINVAL; up_write(&tty->termios_rwsem); return ret; } /** * tty_mode_ioctl - mode related ioctls * @tty: tty for the ioctl * @cmd: command * @arg: ioctl argument * * Perform non-line discipline specific mode control ioctls. This is designed * to be called by line disciplines to ensure they provide consistent mode * setting. */ int tty_mode_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct tty_struct *real_tty; void __user *p = (void __user *)arg; int ret = 0; struct ktermios kterm; if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) real_tty = tty->link; else real_tty = tty; switch (cmd) { #ifdef TIOCGETP case TIOCGETP: return get_sgttyb(real_tty, (struct sgttyb __user *) arg); case TIOCSETP: case TIOCSETN: return set_sgttyb(real_tty, (struct sgttyb __user *) arg); #endif #ifdef TIOCGETC case TIOCGETC: return get_tchars(real_tty, p); case TIOCSETC: return set_tchars(real_tty, p); #endif #ifdef TIOCGLTC case TIOCGLTC: return get_ltchars(real_tty, p); case TIOCSLTC: return set_ltchars(real_tty, p); #endif case TCSETSF: return set_termios(real_tty, p, TERMIOS_FLUSH | TERMIOS_WAIT | TERMIOS_OLD); case TCSETSW: return set_termios(real_tty, p, TERMIOS_WAIT | TERMIOS_OLD); case TCSETS: return set_termios(real_tty, p, TERMIOS_OLD); #ifndef TCGETS2 case TCGETS: copy_termios(real_tty, &kterm); if (kernel_termios_to_user_termios((struct termios __user *)arg, &kterm)) ret = -EFAULT; return ret; #else case TCGETS: copy_termios(real_tty, &kterm); if (kernel_termios_to_user_termios_1((struct termios __user *)arg, &kterm)) ret = -EFAULT; return ret; case TCGETS2: copy_termios(real_tty, &kterm); if (kernel_termios_to_user_termios((struct termios2 __user *)arg, &kterm)) ret = -EFAULT; return ret; case TCSETSF2: return set_termios(real_tty, p, TERMIOS_FLUSH | TERMIOS_WAIT); case TCSETSW2: return set_termios(real_tty, p, TERMIOS_WAIT); case TCSETS2: return set_termios(real_tty, p, 0); #endif case TCGETA: return get_termio(real_tty, p); case TCSETAF: return set_termios(real_tty, p, TERMIOS_FLUSH | TERMIOS_WAIT | TERMIOS_TERMIO); case TCSETAW: return set_termios(real_tty, p, TERMIOS_WAIT | TERMIOS_TERMIO); case TCSETA: return set_termios(real_tty, p, TERMIOS_TERMIO); #ifndef TCGETS2 case TIOCGLCKTRMIOS: copy_termios_locked(real_tty, &kterm); if (kernel_termios_to_user_termios((struct termios __user *)arg, &kterm)) ret = -EFAULT; return ret; case TIOCSLCKTRMIOS: if (!capable(CAP_SYS_ADMIN)) return -EPERM; copy_termios_locked(real_tty, &kterm); if (user_termios_to_kernel_termios(&kterm, (struct termios __user *) arg)) return -EFAULT; down_write(&real_tty->termios_rwsem); real_tty->termios_locked = kterm; up_write(&real_tty->termios_rwsem); return 0; #else case TIOCGLCKTRMIOS: copy_termios_locked(real_tty, &kterm); if (kernel_termios_to_user_termios_1((struct termios __user *)arg, &kterm)) ret = -EFAULT; return ret; case TIOCSLCKTRMIOS: if (!capable(CAP_SYS_ADMIN)) return -EPERM; copy_termios_locked(real_tty, &kterm); if (user_termios_to_kernel_termios_1(&kterm, (struct termios __user *) arg)) return -EFAULT; down_write(&real_tty->termios_rwsem); real_tty->termios_locked = kterm; up_write(&real_tty->termios_rwsem); return ret; #endif #ifdef TCGETX case TCGETX: case TCSETX: case TCSETXW: case TCSETXF: return -ENOTTY; #endif case TIOCGSOFTCAR: copy_termios(real_tty, &kterm); ret = put_user((kterm.c_cflag & CLOCAL) ? 1 : 0, (int __user *)arg); return ret; case TIOCSSOFTCAR: if (get_user(arg, (unsigned int __user *) arg)) return -EFAULT; return tty_change_softcar(real_tty, arg); default: return -ENOIOCTLCMD; } } EXPORT_SYMBOL_GPL(tty_mode_ioctl); /* Caller guarantees ldisc reference is held */ static int __tty_perform_flush(struct tty_struct *tty, unsigned long arg) { struct tty_ldisc *ld = tty->ldisc; switch (arg) { case TCIFLUSH: if (ld && ld->ops->flush_buffer) { ld->ops->flush_buffer(tty); tty_unthrottle(tty); } break; case TCIOFLUSH: if (ld && ld->ops->flush_buffer) { ld->ops->flush_buffer(tty); tty_unthrottle(tty); } fallthrough; case TCOFLUSH: tty_driver_flush_buffer(tty); break; default: return -EINVAL; } return 0; } int tty_perform_flush(struct tty_struct *tty, unsigned long arg) { struct tty_ldisc *ld; int retval = tty_check_change(tty); if (retval) return retval; ld = tty_ldisc_ref_wait(tty); retval = __tty_perform_flush(tty, arg); if (ld) tty_ldisc_deref(ld); return retval; } EXPORT_SYMBOL_GPL(tty_perform_flush); int n_tty_ioctl_helper(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { int retval; switch (cmd) { case TCXONC: retval = tty_check_change(tty); if (retval) return retval; switch (arg) { case TCOOFF: spin_lock_irq(&tty->flow.lock); if (!tty->flow.tco_stopped) { tty->flow.tco_stopped = true; __stop_tty(tty); } spin_unlock_irq(&tty->flow.lock); break; case TCOON: spin_lock_irq(&tty->flow.lock); if (tty->flow.tco_stopped) { tty->flow.tco_stopped = false; __start_tty(tty); } spin_unlock_irq(&tty->flow.lock); break; case TCIOFF: if (STOP_CHAR(tty) != __DISABLED_CHAR) retval = tty_send_xchar(tty, STOP_CHAR(tty)); break; case TCION: if (START_CHAR(tty) != __DISABLED_CHAR) retval = tty_send_xchar(tty, START_CHAR(tty)); break; default: return -EINVAL; } return retval; case TCFLSH: retval = tty_check_change(tty); if (retval) return retval; return __tty_perform_flush(tty, arg); default: /* Try the mode commands */ return tty_mode_ioctl(tty, cmd, arg); } } EXPORT_SYMBOL(n_tty_ioctl_helper);
1340 2 109 1342 1339 1339 1340 1340 1340 1339 132 133 128 50 30 129 129 127 3 4 128 40 53 59 72 101 2 65 124 52 2 2 4 2 12 163 160 160 157 160 1 139 53 53 25 38 177 1 164 3 163 161 4 3 13 53 125 125 1 1 1 1 1 1 1 1 1 5 1 181 1 1 1 175 169 5 3 1 1 2 187 1 17 10 7 410 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 // SPDX-License-Identifier: GPL-2.0-only /* * Packet matching code. * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org> * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/capability.h> #include <linux/in.h> #include <linux/skbuff.h> #include <linux/kmod.h> #include <linux/vmalloc.h> #include <linux/netdevice.h> #include <linux/module.h> #include <linux/poison.h> #include <net/ipv6.h> #include <net/compat.h> #include <linux/uaccess.h> #include <linux/mutex.h> #include <linux/proc_fs.h> #include <linux/err.h> #include <linux/cpumask.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_log.h> #include "../../netfilter/xt_repldata.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("IPv6 packet filter"); void *ip6t_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(ip6t, IP6T); } EXPORT_SYMBOL_GPL(ip6t_alloc_initial_table); /* Returns whether matches rule or not. */ /* Performance critical - called for every packet */ static inline bool ip6_packet_match(const struct sk_buff *skb, const char *indev, const char *outdev, const struct ip6t_ip6 *ip6info, unsigned int *protoff, u16 *fragoff, bool *hotdrop) { unsigned long ret; const struct ipv6hdr *ipv6 = ipv6_hdr(skb); if (NF_INVF(ip6info, IP6T_INV_SRCIP, ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk, &ip6info->src)) || NF_INVF(ip6info, IP6T_INV_DSTIP, ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk, &ip6info->dst))) return false; ret = ifname_compare_aligned(indev, ip6info->iniface, ip6info->iniface_mask); if (NF_INVF(ip6info, IP6T_INV_VIA_IN, ret != 0)) return false; ret = ifname_compare_aligned(outdev, ip6info->outiface, ip6info->outiface_mask); if (NF_INVF(ip6info, IP6T_INV_VIA_OUT, ret != 0)) return false; /* ... might want to do something with class and flowlabel here ... */ /* look for the desired protocol header */ if (ip6info->flags & IP6T_F_PROTO) { int protohdr; unsigned short _frag_off; protohdr = ipv6_find_hdr(skb, protoff, -1, &_frag_off, NULL); if (protohdr < 0) { if (_frag_off == 0) *hotdrop = true; return false; } *fragoff = _frag_off; if (ip6info->proto == protohdr) { if (ip6info->invflags & IP6T_INV_PROTO) return false; return true; } /* We need match for the '-p all', too! */ if ((ip6info->proto != 0) && !(ip6info->invflags & IP6T_INV_PROTO)) return false; } return true; } /* should be ip6 safe */ static bool ip6_checkentry(const struct ip6t_ip6 *ipv6) { if (ipv6->flags & ~IP6T_F_MASK) return false; if (ipv6->invflags & ~IP6T_INV_MASK) return false; return true; } static unsigned int ip6t_error(struct sk_buff *skb, const struct xt_action_param *par) { net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo); return NF_DROP; } static inline struct ip6t_entry * get_entry(const void *base, unsigned int offset) { return (struct ip6t_entry *)(base + offset); } /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ static inline bool unconditional(const struct ip6t_entry *e) { static const struct ip6t_ip6 uncond; return e->target_offset == sizeof(struct ip6t_entry) && memcmp(&e->ipv6, &uncond, sizeof(uncond)) == 0; } static inline const struct xt_entry_target * ip6t_get_target_c(const struct ip6t_entry *e) { return ip6t_get_target((struct ip6t_entry *)e); } #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* This cries for unification! */ static const char *const hooknames[] = { [NF_INET_PRE_ROUTING] = "PREROUTING", [NF_INET_LOCAL_IN] = "INPUT", [NF_INET_FORWARD] = "FORWARD", [NF_INET_LOCAL_OUT] = "OUTPUT", [NF_INET_POST_ROUTING] = "POSTROUTING", }; enum nf_ip_trace_comments { NF_IP6_TRACE_COMMENT_RULE, NF_IP6_TRACE_COMMENT_RETURN, NF_IP6_TRACE_COMMENT_POLICY, }; static const char *const comments[] = { [NF_IP6_TRACE_COMMENT_RULE] = "rule", [NF_IP6_TRACE_COMMENT_RETURN] = "return", [NF_IP6_TRACE_COMMENT_POLICY] = "policy", }; static const struct nf_loginfo trace_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { .level = LOGLEVEL_WARNING, .logflags = NF_LOG_DEFAULT_MASK, }, }, }; /* Mildly perf critical (only if packet tracing is on) */ static inline int get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e, const char *hookname, const char **chainname, const char **comment, unsigned int *rulenum) { const struct xt_standard_target *t = (void *)ip6t_get_target_c(s); if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) { /* Head of user chain: ERROR target with chainname */ *chainname = t->target.data; (*rulenum) = 0; } else if (s == e) { (*rulenum)++; if (unconditional(s) && strcmp(t->target.u.kernel.target->name, XT_STANDARD_TARGET) == 0 && t->verdict < 0) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? comments[NF_IP6_TRACE_COMMENT_POLICY] : comments[NF_IP6_TRACE_COMMENT_RETURN]; } return 1; } else (*rulenum)++; return 0; } static void trace_packet(struct net *net, const struct sk_buff *skb, unsigned int hook, const struct net_device *in, const struct net_device *out, const char *tablename, const struct xt_table_info *private, const struct ip6t_entry *e) { const struct ip6t_entry *root; const char *hookname, *chainname, *comment; const struct ip6t_entry *iter; unsigned int rulenum = 0; root = get_entry(private->entries, private->hook_entry[hook]); hookname = chainname = hooknames[hook]; comment = comments[NF_IP6_TRACE_COMMENT_RULE]; xt_entry_foreach(iter, root, private->size - private->hook_entry[hook]) if (get_chainname_rulenum(iter, e, hookname, &chainname, &comment, &rulenum) != 0) break; nf_log_trace(net, AF_INET6, hook, skb, in, out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", tablename, chainname, comment, rulenum); } #endif static inline struct ip6t_entry * ip6t_next_entry(const struct ip6t_entry *entry) { return (void *)entry + entry->next_offset; } /* Returns one of the generic firewall policies, like NF_ACCEPT. */ unsigned int ip6t_do_table(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { const struct xt_table *table = priv; unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); /* Initializing verdict to NF_DROP keeps gcc happy. */ unsigned int verdict = NF_DROP; const char *indev, *outdev; const void *table_base; struct ip6t_entry *e, **jumpstack; unsigned int stackidx, cpu; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; /* Initialization */ stackidx = 0; indev = state->in ? state->in->name : nulldevname; outdev = state->out ? state->out->name : nulldevname; /* We handle fragments by dealing with the first fragment as * if it was a normal packet. All other fragments are treated * normally, except that they will NEVER match rules that ask * things we don't know, ie. tcp syn flag or ports). If the * rule is also a fragment-specific rule, non-fragments won't * match it. */ acpar.fragoff = 0; acpar.hotdrop = false; acpar.state = state; WARN_ON(!(table->valid_hooks & (1 << hook))); local_bh_disable(); addend = xt_write_recseq_begin(); private = READ_ONCE(table->private); /* Address dependency. */ cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; /* Switch to alternate jumpstack if we're being invoked via TEE. * TEE issues XT_CONTINUE verdict on original skb so we must not * clobber the jumpstack. * * For recursion via REJECT or SYNPROXY the stack will be clobbered * but it is no problem since absolute verdict is issued by these. */ if (static_key_false(&xt_tee_enabled)) jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); e = get_entry(table_base, private->hook_entry[hook]); do { const struct xt_entry_target *t; const struct xt_entry_match *ematch; struct xt_counters *counter; WARN_ON(!e); acpar.thoff = 0; if (!ip6_packet_match(skb, indev, outdev, &e->ipv6, &acpar.thoff, &acpar.fragoff, &acpar.hotdrop)) { no_match: e = ip6t_next_entry(e); continue; } xt_ematch_foreach(ematch, e) { acpar.match = ematch->u.kernel.match; acpar.matchinfo = ematch->data; if (!acpar.match->match(skb, &acpar)) goto no_match; } counter = xt_get_this_cpu_counter(&e->counters); ADD_COUNTER(*counter, skb->len, 1); t = ip6t_get_target_c(e); WARN_ON(!t->u.kernel.target); #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ if (unlikely(skb->nf_trace)) trace_packet(state->net, skb, hook, state->in, state->out, table->name, private, e); #endif /* Standard target? */ if (!t->u.kernel.target->target) { int v; v = ((struct xt_standard_target *)t)->verdict; if (v < 0) { /* Pop from stack? */ if (v != XT_RETURN) { verdict = (unsigned int)(-v) - 1; break; } if (stackidx == 0) e = get_entry(table_base, private->underflow[hook]); else e = ip6t_next_entry(jumpstack[--stackidx]); continue; } if (table_base + v != ip6t_next_entry(e) && !(e->ipv6.flags & IP6T_F_GOTO)) { if (unlikely(stackidx >= private->stacksize)) { verdict = NF_DROP; break; } jumpstack[stackidx++] = e; } e = get_entry(table_base, v); continue; } acpar.target = t->u.kernel.target; acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); if (verdict == XT_CONTINUE) e = ip6t_next_entry(e); else /* Verdict */ break; } while (!acpar.hotdrop); xt_write_recseq_end(addend); local_bh_enable(); if (acpar.hotdrop) return NF_DROP; else return verdict; } /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int mark_source_chains(const struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0, unsigned int *offsets) { unsigned int hook; /* No recursion; use packet counter to save back ptrs (reset to 0 as we leave), and comefrom to save source hook bitmask */ for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ip6t_entry *e = entry0 + pos; if (!(valid_hooks & (1 << hook))) continue; /* Set initial back pointer. */ e->counters.pcnt = pos; for (;;) { const struct xt_standard_target *t = (void *)ip6t_get_target_c(e); int visited = e->comefrom & (1 << hook); if (e->comefrom & (1 << NF_INET_NUMHOOKS)) return 0; e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && t->verdict < 0) || visited) { unsigned int oldpos, size; /* Return: backtrack through the last big jump. */ do { e->comefrom ^= (1<<NF_INET_NUMHOOKS); oldpos = pos; pos = e->counters.pcnt; e->counters.pcnt = 0; /* We're at the start. */ if (pos == oldpos) goto next; e = entry0 + pos; } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = entry0 + pos + size; if (pos + size >= newinfo->size) return 0; e->counters.pcnt = pos; pos += size; } else { int newpos = t->verdict; if (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. */ if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; if (newpos >= newinfo->size) return 0; } e = entry0 + newpos; e->counters.pcnt = pos; pos = newpos; } } next: ; } return 1; } static void cleanup_match(struct xt_entry_match *m, struct net *net) { struct xt_mtdtor_param par; par.net = net; par.match = m->u.kernel.match; par.matchinfo = m->data; par.family = NFPROTO_IPV6; if (par.match->destroy != NULL) par.match->destroy(&par); module_put(par.match->me); } static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { const struct ip6t_ip6 *ipv6 = par->entryinfo; par->match = m->u.kernel.match; par->matchinfo = m->data; return xt_check_match(par, m->u.match_size - sizeof(*m), ipv6->proto, ipv6->invflags & IP6T_INV_PROTO); } static int find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { struct xt_match *match; int ret; match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, m->u.user.revision); if (IS_ERR(match)) return PTR_ERR(match); m->u.kernel.match = match; ret = check_match(m, par); if (ret) goto err; return 0; err: module_put(m->u.kernel.match->me); return ret; } static int check_target(struct ip6t_entry *e, struct net *net, const char *name) { struct xt_entry_target *t = ip6t_get_target(e); struct xt_tgchk_param par = { .net = net, .table = name, .entryinfo = e, .target = t->u.kernel.target, .targinfo = t->data, .hook_mask = e->comefrom, .family = NFPROTO_IPV6, }; return xt_check_target(&par, t->u.target_size - sizeof(*t), e->ipv6.proto, e->ipv6.invflags & IP6T_INV_PROTO); } static int find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, unsigned int size, struct xt_percpu_counter_alloc_state *alloc_state) { struct xt_entry_target *t; struct xt_target *target; int ret; unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; if (!xt_percpu_counter_alloc(alloc_state, &e->counters)) return -ENOMEM; j = 0; memset(&mtpar, 0, sizeof(mtpar)); mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ipv6; mtpar.hook_mask = e->comefrom; mtpar.family = NFPROTO_IPV6; xt_ematch_foreach(ematch, e) { ret = find_check_match(ematch, &mtpar); if (ret != 0) goto cleanup_matches; ++j; } t = ip6t_get_target(e); target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { ret = PTR_ERR(target); goto cleanup_matches; } t->u.kernel.target = target; ret = check_target(e, net, name); if (ret) goto err; return 0; err: module_put(t->u.kernel.target->me); cleanup_matches: xt_ematch_foreach(ematch, e) { if (j-- == 0) break; cleanup_match(ematch, net); } xt_percpu_counter_free(&e->counters); return ret; } static bool check_underflow(const struct ip6t_entry *e) { const struct xt_entry_target *t; unsigned int verdict; if (!unconditional(e)) return false; t = ip6t_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) return false; verdict = ((struct xt_standard_target *)t)->verdict; verdict = -verdict - 1; return verdict == NF_DROP || verdict == NF_ACCEPT; } static int check_entry_size_and_hooks(struct ip6t_entry *e, struct xt_table_info *newinfo, const unsigned char *base, const unsigned char *limit, const unsigned int *hook_entries, const unsigned int *underflows, unsigned int valid_hooks) { unsigned int h; int err; if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct ip6t_entry) >= limit || (unsigned char *)e + e->next_offset > limit) return -EINVAL; if (e->next_offset < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) return -EINVAL; if (!ip6_checkentry(&e->ipv6)) return -EINVAL; err = xt_check_entry_offsets(e, e->elems, e->target_offset, e->next_offset); if (err) return err; /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) continue; if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) return -EINVAL; newinfo->underflow[h] = underflows[h]; } } /* Clear counters and comefrom */ e->counters = ((struct xt_counters) { 0, 0 }); e->comefrom = 0; return 0; } static void cleanup_entry(struct ip6t_entry *e, struct net *net) { struct xt_tgdtor_param par; struct xt_entry_target *t; struct xt_entry_match *ematch; /* Cleanup all matches */ xt_ematch_foreach(ematch, e) cleanup_match(ematch, net); t = ip6t_get_target(e); par.net = net; par.target = t->u.kernel.target; par.targinfo = t->data; par.family = NFPROTO_IPV6; if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); xt_percpu_counter_free(&e->counters); } /* Checks and translates the user-supplied table segment (held in newinfo) */ static int translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, const struct ip6t_replace *repl) { struct xt_percpu_counter_alloc_state alloc_state = { 0 }; struct ip6t_entry *iter; unsigned int *offsets; unsigned int i; int ret = 0; newinfo->size = repl->size; newinfo->number = repl->num_entries; /* Init all hooks to impossible value. */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = 0xFFFFFFFF; newinfo->underflow[i] = 0xFFFFFFFF; } offsets = xt_alloc_entry_offsets(newinfo->number); if (!offsets) return -ENOMEM; i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { ret = check_entry_size_and_hooks(iter, newinfo, entry0, entry0 + repl->size, repl->hook_entry, repl->underflow, repl->valid_hooks); if (ret != 0) goto out_free; if (i < repl->num_entries) offsets[i] = (void *)iter - entry0; ++i; if (strcmp(ip6t_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } ret = -EINVAL; if (i != repl->num_entries) goto out_free; ret = xt_check_table_hooks(newinfo, repl->valid_hooks); if (ret) goto out_free; if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { ret = -ELOOP; goto out_free; } kvfree(offsets); /* Finally, each sanity check must pass */ i = 0; xt_entry_foreach(iter, entry0, newinfo->size) { ret = find_check_entry(iter, net, repl->name, repl->size, &alloc_state); if (ret != 0) break; ++i; } if (ret != 0) { xt_entry_foreach(iter, entry0, newinfo->size) { if (i-- == 0) break; cleanup_entry(iter, net); } return ret; } return ret; out_free: kvfree(offsets); return ret; } static void get_counters(const struct xt_table_info *t, struct xt_counters counters[]) { struct ip6t_entry *iter; unsigned int cpu; unsigned int i; for_each_possible_cpu(cpu) { seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; xt_entry_foreach(iter, t->entries, t->size) { struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); bcnt = tmp->bcnt; pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); ++i; cond_resched(); } } } static void get_old_counters(const struct xt_table_info *t, struct xt_counters counters[]) { struct ip6t_entry *iter; unsigned int cpu, i; for_each_possible_cpu(cpu) { i = 0; xt_entry_foreach(iter, t->entries, t->size) { const struct xt_counters *tmp; tmp = xt_get_per_cpu_counter(&iter->counters, cpu); ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); ++i; } cond_resched(); } } static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct xt_counters) * private->number; counters = vzalloc(countersize); if (counters == NULL) return ERR_PTR(-ENOMEM); get_counters(private, counters); return counters; } static int copy_entries_to_user(unsigned int total_size, const struct xt_table *table, void __user *userptr) { unsigned int off, num; const struct ip6t_entry *e; struct xt_counters *counters; const struct xt_table_info *private = table->private; int ret = 0; const void *loc_cpu_entry; counters = alloc_counters(table); if (IS_ERR(counters)) return PTR_ERR(counters); loc_cpu_entry = private->entries; /* FIXME: use iterator macros --RR */ /* ... then go back and fix counters and names */ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ unsigned int i; const struct xt_entry_match *m; const struct xt_entry_target *t; e = loc_cpu_entry + off; if (copy_to_user(userptr + off, e, sizeof(*e))) { ret = -EFAULT; goto free_counters; } if (copy_to_user(userptr + off + offsetof(struct ip6t_entry, counters), &counters[num], sizeof(counters[num])) != 0) { ret = -EFAULT; goto free_counters; } for (i = sizeof(struct ip6t_entry); i < e->target_offset; i += m->u.match_size) { m = (void *)e + i; if (xt_match_to_user(m, userptr + off + i)) { ret = -EFAULT; goto free_counters; } } t = ip6t_get_target_c(e); if (xt_target_to_user(t, userptr + off + e->target_offset)) { ret = -EFAULT; goto free_counters; } } free_counters: vfree(counters); return ret; } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; if (v > 0) v += xt_compat_calc_jump(AF_INET6, v); memcpy(dst, &v, sizeof(v)); } static int compat_standard_to_user(void __user *dst, const void *src) { compat_int_t cv = *(int *)src; if (cv > 0) cv -= xt_compat_calc_jump(AF_INET6, cv); return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; } static int compat_calc_entry(const struct ip6t_entry *e, const struct xt_table_info *info, const void *base, struct xt_table_info *newinfo) { const struct xt_entry_match *ematch; const struct xt_entry_target *t; unsigned int entry_offset; int off, i, ret; off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); entry_offset = (void *)e - base; xt_ematch_foreach(ematch, e) off += xt_compat_match_offset(ematch->u.kernel.match); t = ip6t_get_target_c(e); off += xt_compat_target_offset(t->u.kernel.target); newinfo->size -= off; ret = xt_compat_add_offset(AF_INET6, entry_offset, off); if (ret) return ret; for (i = 0; i < NF_INET_NUMHOOKS; i++) { if (info->hook_entry[i] && (e < (struct ip6t_entry *)(base + info->hook_entry[i]))) newinfo->hook_entry[i] -= off; if (info->underflow[i] && (e < (struct ip6t_entry *)(base + info->underflow[i]))) newinfo->underflow[i] -= off; } return 0; } static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct ip6t_entry *iter; const void *loc_cpu_entry; int ret; if (!newinfo || !info) return -EINVAL; /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries; ret = xt_compat_init_offsets(AF_INET6, info->number); if (ret) return ret; xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) return ret; } return 0; } #endif static int get_info(struct net *net, void __user *user, const int *len) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; int ret; if (*len != sizeof(struct ip6t_getinfo)) return -EINVAL; if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_lock(AF_INET6); #endif t = xt_request_find_table_lock(net, AF_INET6, name); if (!IS_ERR(t)) { struct ip6t_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct xt_table_info tmp; if (in_compat_syscall()) { ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(AF_INET6); private = &tmp; } #endif memset(&info, 0, sizeof(info)); info.valid_hooks = t->valid_hooks; memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); memcpy(info.underflow, private->underflow, sizeof(info.underflow)); info.num_entries = private->number; info.size = private->size; strcpy(info.name, name); if (copy_to_user(user, &info, *len) != 0) ret = -EFAULT; else ret = 0; xt_table_unlock(t); module_put(t->me); } else ret = PTR_ERR(t); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_unlock(AF_INET6); #endif return ret; } static int get_entries(struct net *net, struct ip6t_get_entries __user *uptr, const int *len) { int ret; struct ip6t_get_entries get; struct xt_table *t; if (*len < sizeof(get)) return -EINVAL; if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; if (*len != sizeof(struct ip6t_get_entries) + get.size) return -EINVAL; get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR(t)) { struct xt_table_info *private = t->private; if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); else ret = -EAGAIN; module_put(t->me); xt_table_unlock(t); } else ret = PTR_ERR(t); return ret; } static int __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table_info *newinfo, unsigned int num_counters, void __user *counters_ptr) { int ret; struct xt_table *t; struct xt_table_info *oldinfo; struct xt_counters *counters; struct ip6t_entry *iter; counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; goto out; } t = xt_request_find_table_lock(net, AF_INET6, name); if (IS_ERR(t)) { ret = PTR_ERR(t); goto free_newinfo_counters_untrans; } /* You lied! */ if (valid_hooks != t->valid_hooks) { ret = -EINVAL; goto put_module; } oldinfo = xt_replace_table(t, num_counters, newinfo, &ret); if (!oldinfo) goto put_module; /* Update module usage count based on number of rules */ if ((oldinfo->number > oldinfo->initial_entries) || (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); if ((oldinfo->number > oldinfo->initial_entries) && (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); xt_table_unlock(t); get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) cleanup_entry(iter, net); xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, sizeof(struct xt_counters) * num_counters) != 0) { /* Silent error, can't fail, new table is already in place */ net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n"); } vfree(counters); return 0; put_module: module_put(t->me); xt_table_unlock(t); free_newinfo_counters_untrans: vfree(counters); out: return ret; } static int do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct ip6t_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; struct ip6t_entry *iter; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } ret = translate_table(net, newinfo, loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); if (ret) goto free_newinfo_untrans; return 0; free_newinfo_untrans: xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; } static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) { unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; struct xt_table *t; const struct xt_table_info *private; int ret = 0; struct ip6t_entry *iter; unsigned int addend; paddc = xt_copy_counters(arg, len, &tmp); if (IS_ERR(paddc)) return PTR_ERR(paddc); t = xt_find_table_lock(net, AF_INET6, tmp.name); if (IS_ERR(t)) { ret = PTR_ERR(t); goto free; } local_bh_disable(); private = t->private; if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; } i = 0; addend = xt_write_recseq_begin(); xt_entry_foreach(iter, private->entries, private->size) { struct xt_counters *tmp; tmp = xt_get_this_cpu_counter(&iter->counters); ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); unlock_up_free: local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: vfree(paddc); return ret; } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ip6t_replace { char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; u32 num_entries; u32 size; u32 hook_entry[NF_INET_NUMHOOKS]; u32 underflow[NF_INET_NUMHOOKS]; u32 num_counters; compat_uptr_t counters; /* struct xt_counters * */ struct compat_ip6t_entry entries[]; }; static int compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr, unsigned int *size, struct xt_counters *counters, unsigned int i) { struct xt_entry_target *t; struct compat_ip6t_entry __user *ce; u_int16_t target_offset, next_offset; compat_uint_t origsize; const struct xt_entry_match *ematch; int ret = 0; origsize = *size; ce = *dstptr; if (copy_to_user(ce, e, sizeof(struct ip6t_entry)) != 0 || copy_to_user(&ce->counters, &counters[i], sizeof(counters[i])) != 0) return -EFAULT; *dstptr += sizeof(struct compat_ip6t_entry); *size -= sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); xt_ematch_foreach(ematch, e) { ret = xt_compat_match_to_user(ematch, dstptr, size); if (ret != 0) return ret; } target_offset = e->target_offset - (origsize - *size); t = ip6t_get_target(e); ret = xt_compat_target_to_user(t, dstptr, size); if (ret) return ret; next_offset = e->next_offset - (origsize - *size); if (put_user(target_offset, &ce->target_offset) != 0 || put_user(next_offset, &ce->next_offset) != 0) return -EFAULT; return 0; } static int compat_find_calc_match(struct xt_entry_match *m, const struct ip6t_ip6 *ipv6, int *size) { struct xt_match *match; match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, m->u.user.revision); if (IS_ERR(match)) return PTR_ERR(match); m->u.kernel.match = match; *size += xt_compat_match_offset(match); return 0; } static void compat_release_entry(struct compat_ip6t_entry *e) { struct xt_entry_target *t; struct xt_entry_match *ematch; /* Cleanup all matches */ xt_ematch_foreach(ematch, e) module_put(ematch->u.kernel.match->me); t = compat_ip6t_get_target(e); module_put(t->u.kernel.target->me); } static int check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, struct xt_table_info *newinfo, unsigned int *size, const unsigned char *base, const unsigned char *limit) { struct xt_entry_match *ematch; struct xt_entry_target *t; struct xt_target *target; unsigned int entry_offset; unsigned int j; int ret, off; if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit || (unsigned char *)e + e->next_offset > limit) return -EINVAL; if (e->next_offset < sizeof(struct compat_ip6t_entry) + sizeof(struct compat_xt_entry_target)) return -EINVAL; if (!ip6_checkentry(&e->ipv6)) return -EINVAL; ret = xt_compat_check_entry_offsets(e, e->elems, e->target_offset, e->next_offset); if (ret) return ret; off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); entry_offset = (void *)e - (void *)base; j = 0; xt_ematch_foreach(ematch, e) { ret = compat_find_calc_match(ematch, &e->ipv6, &off); if (ret != 0) goto release_matches; ++j; } t = compat_ip6t_get_target(e); target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { ret = PTR_ERR(target); goto release_matches; } t->u.kernel.target = target; off += xt_compat_target_offset(target); *size += off; ret = xt_compat_add_offset(AF_INET6, entry_offset, off); if (ret) goto out; return 0; out: module_put(t->u.kernel.target->me); release_matches: xt_ematch_foreach(ematch, e) { if (j-- == 0) break; module_put(ematch->u.kernel.match->me); } return ret; } static void compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr, unsigned int *size, struct xt_table_info *newinfo, unsigned char *base) { struct xt_entry_target *t; struct ip6t_entry *de; unsigned int origsize; int h; struct xt_entry_match *ematch; origsize = *size; de = *dstptr; memcpy(de, e, sizeof(struct ip6t_entry)); memcpy(&de->counters, &e->counters, sizeof(e->counters)); *dstptr += sizeof(struct ip6t_entry); *size += sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); xt_ematch_foreach(ematch, e) xt_compat_match_from_user(ematch, dstptr, size); de->target_offset = e->target_offset - (origsize - *size); t = compat_ip6t_get_target(e); xt_compat_target_from_user(t, dstptr, size); de->next_offset = e->next_offset - (origsize - *size); for (h = 0; h < NF_INET_NUMHOOKS; h++) { if ((unsigned char *)de - base < newinfo->hook_entry[h]) newinfo->hook_entry[h] -= origsize - *size; if ((unsigned char *)de - base < newinfo->underflow[h]) newinfo->underflow[h] -= origsize - *size; } } static int translate_compat_table(struct net *net, struct xt_table_info **pinfo, void **pentry0, const struct compat_ip6t_replace *compatr) { unsigned int i, j; struct xt_table_info *newinfo, *info; void *pos, *entry0, *entry1; struct compat_ip6t_entry *iter0; struct ip6t_replace repl; unsigned int size; int ret; info = *pinfo; entry0 = *pentry0; size = compatr->size; info->number = compatr->num_entries; j = 0; xt_compat_lock(AF_INET6); ret = xt_compat_init_offsets(AF_INET6, compatr->num_entries); if (ret) goto out_unlock; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, compatr->size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, entry0, entry0 + compatr->size); if (ret != 0) goto out_unlock; ++j; } ret = -EINVAL; if (j != compatr->num_entries) goto out_unlock; ret = -ENOMEM; newinfo = xt_alloc_table_info(size); if (!newinfo) goto out_unlock; memset(newinfo->entries, 0, size); newinfo->number = compatr->num_entries; for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = compatr->hook_entry[i]; newinfo->underflow[i] = compatr->underflow[i]; } entry1 = newinfo->entries; pos = entry1; size = compatr->size; xt_entry_foreach(iter0, entry0, compatr->size) compat_copy_entry_from_user(iter0, &pos, &size, newinfo, entry1); /* all module references in entry0 are now gone. */ xt_compat_flush_offsets(AF_INET6); xt_compat_unlock(AF_INET6); memcpy(&repl, compatr, sizeof(*compatr)); for (i = 0; i < NF_INET_NUMHOOKS; i++) { repl.hook_entry[i] = newinfo->hook_entry[i]; repl.underflow[i] = newinfo->underflow[i]; } repl.num_counters = 0; repl.counters = NULL; repl.size = newinfo->size; ret = translate_table(net, newinfo, entry1, &repl); if (ret) goto free_newinfo; *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); return 0; free_newinfo: xt_free_table_info(newinfo); return ret; out_unlock: xt_compat_flush_offsets(AF_INET6); xt_compat_unlock(AF_INET6); xt_entry_foreach(iter0, entry0, compatr->size) { if (j-- == 0) break; compat_release_entry(iter0); } return ret; } static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct compat_ip6t_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; struct ip6t_entry *iter; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, compat_ptr(tmp.counters)); if (ret) goto free_newinfo_untrans; return 0; free_newinfo_untrans: xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; } struct compat_ip6t_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; struct compat_ip6t_entry entrytable[]; }; static int compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { struct xt_counters *counters; const struct xt_table_info *private = table->private; void __user *pos; unsigned int size; int ret = 0; unsigned int i = 0; struct ip6t_entry *iter; counters = alloc_counters(table); if (IS_ERR(counters)) return PTR_ERR(counters); pos = userptr; size = total_size; xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) break; } vfree(counters); return ret; } static int compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, int *len) { int ret; struct compat_ip6t_get_entries get; struct xt_table *t; if (*len < sizeof(get)) return -EINVAL; if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; if (*len != sizeof(struct compat_ip6t_get_entries) + get.size) return -EINVAL; get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(AF_INET6); t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) ret = compat_copy_entries_to_user(private->size, t, uptr->entrytable); else if (!ret) ret = -EAGAIN; xt_compat_flush_offsets(AF_INET6); module_put(t->me); xt_table_unlock(t); } else ret = PTR_ERR(t); xt_compat_unlock(AF_INET6); return ret; } #endif static int do_ip6t_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len) { int ret; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case IP6T_SO_SET_REPLACE: #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_do_replace(sock_net(sk), arg, len); else #endif ret = do_replace(sock_net(sk), arg, len); break; case IP6T_SO_SET_ADD_COUNTERS: ret = do_add_counters(sock_net(sk), arg, len); break; default: ret = -EINVAL; } return ret; } static int do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case IP6T_SO_GET_INFO: ret = get_info(sock_net(sk), user, len); break; case IP6T_SO_GET_ENTRIES: #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_get_entries(sock_net(sk), user, len); else #endif ret = get_entries(sock_net(sk), user, len); break; case IP6T_SO_GET_REVISION_MATCH: case IP6T_SO_GET_REVISION_TARGET: { struct xt_get_revision rev; int target; if (*len != sizeof(rev)) { ret = -EINVAL; break; } if (copy_from_user(&rev, user, sizeof(rev)) != 0) { ret = -EFAULT; break; } rev.name[sizeof(rev.name)-1] = 0; if (cmd == IP6T_SO_GET_REVISION_TARGET) target = 1; else target = 0; try_then_request_module(xt_find_revision(AF_INET6, rev.name, rev.revision, target, &ret), "ip6t_%s", rev.name); break; } default: ret = -EINVAL; } return ret; } static void __ip6t_unregister_table(struct net *net, struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; struct module *table_owner = table->me; struct ip6t_entry *iter; private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter, net); if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); } int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, const struct nf_hook_ops *template_ops) { struct nf_hook_ops *ops; unsigned int num_ops; int ret, i; struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; void *loc_cpu_entry; struct xt_table *new_table; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); if (ret != 0) { xt_free_table_info(newinfo); return ret; } new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct ip6t_entry *iter; xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); xt_free_table_info(newinfo); return PTR_ERR(new_table); } if (!template_ops) return 0; num_ops = hweight32(table->valid_hooks); if (num_ops == 0) { ret = -EINVAL; goto out_free; } ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); if (!ops) { ret = -ENOMEM; goto out_free; } for (i = 0; i < num_ops; i++) ops[i].priv = new_table; new_table->ops = ops; ret = nf_register_net_hooks(net, ops, num_ops); if (ret != 0) goto out_free; return ret; out_free: __ip6t_unregister_table(net, new_table); return ret; } void ip6t_unregister_table_pre_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); if (table) nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } void ip6t_unregister_table_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); if (table) __ip6t_unregister_table(net, table); } /* The built-in targets: standard (NULL) and error. */ static struct xt_target ip6t_builtin_tg[] __read_mostly = { { .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_IPV6, #ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = compat_standard_from_user, .compat_to_user = compat_standard_to_user, #endif }, { .name = XT_ERROR_TARGET, .target = ip6t_error, .targetsize = XT_FUNCTION_MAXNAMELEN, .family = NFPROTO_IPV6, }, }; static struct nf_sockopt_ops ip6t_sockopts = { .pf = PF_INET6, .set_optmin = IP6T_BASE_CTL, .set_optmax = IP6T_SO_SET_MAX+1, .set = do_ip6t_set_ctl, .get_optmin = IP6T_BASE_CTL, .get_optmax = IP6T_SO_GET_MAX+1, .get = do_ip6t_get_ctl, .owner = THIS_MODULE, }; static int __net_init ip6_tables_net_init(struct net *net) { return xt_proto_init(net, NFPROTO_IPV6); } static void __net_exit ip6_tables_net_exit(struct net *net) { xt_proto_fini(net, NFPROTO_IPV6); } static struct pernet_operations ip6_tables_net_ops = { .init = ip6_tables_net_init, .exit = ip6_tables_net_exit, }; static int __init ip6_tables_init(void) { int ret; ret = register_pernet_subsys(&ip6_tables_net_ops); if (ret < 0) goto err1; /* No one else will be downing sem now, so we won't sleep */ ret = xt_register_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); if (ret < 0) goto err2; /* Register setsockopt */ ret = nf_register_sockopt(&ip6t_sockopts); if (ret < 0) goto err4; return 0; err4: xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); err2: unregister_pernet_subsys(&ip6_tables_net_ops); err1: return ret; } static void __exit ip6_tables_fini(void) { nf_unregister_sockopt(&ip6t_sockopts); xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); unregister_pernet_subsys(&ip6_tables_net_ops); } EXPORT_SYMBOL(ip6t_register_table); EXPORT_SYMBOL(ip6t_unregister_table_pre_exit); EXPORT_SYMBOL(ip6t_unregister_table_exit); EXPORT_SYMBOL(ip6t_do_table); module_init(ip6_tables_init); module_exit(ip6_tables_fini);
1 1 1 1 1 4 4 3 3 3 8 111 2 1 4 3 3 9 9 2 2 1 6 2 2 3 3 2 1 1 2 7 1 2 4 1 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 // SPDX-License-Identifier: GPL-2.0-only /* * kvm eventfd support - use eventfd objects to signal various KVM events * * Copyright 2009 Novell. All Rights Reserved. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Author: * Gregory Haskins <ghaskins@novell.com> */ #include <linux/kvm_host.h> #include <linux/kvm.h> #include <linux/kvm_irqfd.h> #include <linux/workqueue.h> #include <linux/syscalls.h> #include <linux/wait.h> #include <linux/poll.h> #include <linux/file.h> #include <linux/list.h> #include <linux/eventfd.h> #include <linux/kernel.h> #include <linux/srcu.h> #include <linux/slab.h> #include <linux/seqlock.h> #include <linux/irqbypass.h> #include <trace/events/kvm.h> #include <kvm/iodev.h> #ifdef CONFIG_HAVE_KVM_IRQFD static struct workqueue_struct *irqfd_cleanup_wq; bool __attribute__((weak)) kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args) { return true; } static void irqfd_inject(struct work_struct *work) { struct kvm_kernel_irqfd *irqfd = container_of(work, struct kvm_kernel_irqfd, inject); struct kvm *kvm = irqfd->kvm; if (!irqfd->resampler) { kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1, false); kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0, false); } else kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, irqfd->gsi, 1, false); } static void irqfd_resampler_notify(struct kvm_kernel_irqfd_resampler *resampler) { struct kvm_kernel_irqfd *irqfd; list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link, srcu_read_lock_held(&resampler->kvm->irq_srcu)) eventfd_signal(irqfd->resamplefd, 1); } /* * Since resampler irqfds share an IRQ source ID, we de-assert once * then notify all of the resampler irqfds using this GSI. We can't * do multiple de-asserts or we risk racing with incoming re-asserts. */ static void irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian) { struct kvm_kernel_irqfd_resampler *resampler; struct kvm *kvm; int idx; resampler = container_of(kian, struct kvm_kernel_irqfd_resampler, notifier); kvm = resampler->kvm; kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, resampler->notifier.gsi, 0, false); idx = srcu_read_lock(&kvm->irq_srcu); irqfd_resampler_notify(resampler); srcu_read_unlock(&kvm->irq_srcu, idx); } static void irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd) { struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler; struct kvm *kvm = resampler->kvm; mutex_lock(&kvm->irqfds.resampler_lock); list_del_rcu(&irqfd->resampler_link); synchronize_srcu(&kvm->irq_srcu); if (list_empty(&resampler->list)) { list_del_rcu(&resampler->link); kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier); /* * synchronize_srcu(&kvm->irq_srcu) already called * in kvm_unregister_irq_ack_notifier(). */ kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, resampler->notifier.gsi, 0, false); kfree(resampler); } mutex_unlock(&kvm->irqfds.resampler_lock); } /* * Race-free decouple logic (ordering is critical) */ static void irqfd_shutdown(struct work_struct *work) { struct kvm_kernel_irqfd *irqfd = container_of(work, struct kvm_kernel_irqfd, shutdown); struct kvm *kvm = irqfd->kvm; u64 cnt; /* Make sure irqfd has been initialized in assign path. */ synchronize_srcu(&kvm->irq_srcu); /* * Synchronize with the wait-queue and unhook ourselves to prevent * further events. */ eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt); /* * We know no new events will be scheduled at this point, so block * until all previously outstanding events have completed */ flush_work(&irqfd->inject); if (irqfd->resampler) { irqfd_resampler_shutdown(irqfd); eventfd_ctx_put(irqfd->resamplefd); } /* * It is now safe to release the object's resources */ #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS irq_bypass_unregister_consumer(&irqfd->consumer); #endif eventfd_ctx_put(irqfd->eventfd); kfree(irqfd); } /* assumes kvm->irqfds.lock is held */ static bool irqfd_is_active(struct kvm_kernel_irqfd *irqfd) { return list_empty(&irqfd->list) ? false : true; } /* * Mark the irqfd as inactive and schedule it for removal * * assumes kvm->irqfds.lock is held */ static void irqfd_deactivate(struct kvm_kernel_irqfd *irqfd) { BUG_ON(!irqfd_is_active(irqfd)); list_del_init(&irqfd->list); queue_work(irqfd_cleanup_wq, &irqfd->shutdown); } int __attribute__((weak)) kvm_arch_set_irq_inatomic( struct kvm_kernel_irq_routing_entry *irq, struct kvm *kvm, int irq_source_id, int level, bool line_status) { return -EWOULDBLOCK; } /* * Called with wqh->lock held and interrupts disabled */ static int irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct kvm_kernel_irqfd *irqfd = container_of(wait, struct kvm_kernel_irqfd, wait); __poll_t flags = key_to_poll(key); struct kvm_kernel_irq_routing_entry irq; struct kvm *kvm = irqfd->kvm; unsigned seq; int idx; int ret = 0; if (flags & EPOLLIN) { u64 cnt; eventfd_ctx_do_read(irqfd->eventfd, &cnt); idx = srcu_read_lock(&kvm->irq_srcu); do { seq = read_seqcount_begin(&irqfd->irq_entry_sc); irq = irqfd->irq_entry; } while (read_seqcount_retry(&irqfd->irq_entry_sc, seq)); /* An event has been signaled, inject an interrupt */ if (kvm_arch_set_irq_inatomic(&irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false) == -EWOULDBLOCK) schedule_work(&irqfd->inject); srcu_read_unlock(&kvm->irq_srcu, idx); ret = 1; } if (flags & EPOLLHUP) { /* The eventfd is closing, detach from KVM */ unsigned long iflags; spin_lock_irqsave(&kvm->irqfds.lock, iflags); /* * We must check if someone deactivated the irqfd before * we could acquire the irqfds.lock since the item is * deactivated from the KVM side before it is unhooked from * the wait-queue. If it is already deactivated, we can * simply return knowing the other side will cleanup for us. * We cannot race against the irqfd going away since the * other side is required to acquire wqh->lock, which we hold */ if (irqfd_is_active(irqfd)) irqfd_deactivate(irqfd); spin_unlock_irqrestore(&kvm->irqfds.lock, iflags); } return ret; } static void irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt) { struct kvm_kernel_irqfd *irqfd = container_of(pt, struct kvm_kernel_irqfd, pt); add_wait_queue_priority(wqh, &irqfd->wait); } /* Must be called under irqfds.lock */ static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd) { struct kvm_kernel_irq_routing_entry *e; struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; int n_entries; n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi); write_seqcount_begin(&irqfd->irq_entry_sc); e = entries; if (n_entries == 1) irqfd->irq_entry = *e; else irqfd->irq_entry.type = 0; write_seqcount_end(&irqfd->irq_entry_sc); } #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS void __attribute__((weak)) kvm_arch_irq_bypass_stop( struct irq_bypass_consumer *cons) { } void __attribute__((weak)) kvm_arch_irq_bypass_start( struct irq_bypass_consumer *cons) { } int __attribute__((weak)) kvm_arch_update_irqfd_routing( struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set) { return 0; } bool __attribute__((weak)) kvm_arch_irqfd_route_changed( struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new) { return true; } #endif static int kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) { struct kvm_kernel_irqfd *irqfd, *tmp; struct fd f; struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; int ret; __poll_t events; int idx; if (!kvm_arch_intc_initialized(kvm)) return -EAGAIN; if (!kvm_arch_irqfd_allowed(kvm, args)) return -EINVAL; irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT); if (!irqfd) return -ENOMEM; irqfd->kvm = kvm; irqfd->gsi = args->gsi; INIT_LIST_HEAD(&irqfd->list); INIT_WORK(&irqfd->inject, irqfd_inject); INIT_WORK(&irqfd->shutdown, irqfd_shutdown); seqcount_spinlock_init(&irqfd->irq_entry_sc, &kvm->irqfds.lock); f = fdget(args->fd); if (!f.file) { ret = -EBADF; goto out; } eventfd = eventfd_ctx_fileget(f.file); if (IS_ERR(eventfd)) { ret = PTR_ERR(eventfd); goto fail; } irqfd->eventfd = eventfd; if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) { struct kvm_kernel_irqfd_resampler *resampler; resamplefd = eventfd_ctx_fdget(args->resamplefd); if (IS_ERR(resamplefd)) { ret = PTR_ERR(resamplefd); goto fail; } irqfd->resamplefd = resamplefd; INIT_LIST_HEAD(&irqfd->resampler_link); mutex_lock(&kvm->irqfds.resampler_lock); list_for_each_entry(resampler, &kvm->irqfds.resampler_list, link) { if (resampler->notifier.gsi == irqfd->gsi) { irqfd->resampler = resampler; break; } } if (!irqfd->resampler) { resampler = kzalloc(sizeof(*resampler), GFP_KERNEL_ACCOUNT); if (!resampler) { ret = -ENOMEM; mutex_unlock(&kvm->irqfds.resampler_lock); goto fail; } resampler->kvm = kvm; INIT_LIST_HEAD(&resampler->list); resampler->notifier.gsi = irqfd->gsi; resampler->notifier.irq_acked = irqfd_resampler_ack; INIT_LIST_HEAD(&resampler->link); list_add_rcu(&resampler->link, &kvm->irqfds.resampler_list); kvm_register_irq_ack_notifier(kvm, &resampler->notifier); irqfd->resampler = resampler; } list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list); synchronize_srcu(&kvm->irq_srcu); mutex_unlock(&kvm->irqfds.resampler_lock); } /* * Install our own custom wake-up handling so we are notified via * a callback whenever someone signals the underlying eventfd */ init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup); init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc); spin_lock_irq(&kvm->irqfds.lock); ret = 0; list_for_each_entry(tmp, &kvm->irqfds.items, list) { if (irqfd->eventfd != tmp->eventfd) continue; /* This fd is used for another irq already. */ ret = -EBUSY; spin_unlock_irq(&kvm->irqfds.lock); goto fail; } idx = srcu_read_lock(&kvm->irq_srcu); irqfd_update(kvm, irqfd); list_add_tail(&irqfd->list, &kvm->irqfds.items); spin_unlock_irq(&kvm->irqfds.lock); /* * Check if there was an event already pending on the eventfd * before we registered, and trigger it as if we didn't miss it. */ events = vfs_poll(f.file, &irqfd->pt); if (events & EPOLLIN) schedule_work(&irqfd->inject); #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS if (kvm_arch_has_irq_bypass()) { irqfd->consumer.token = (void *)irqfd->eventfd; irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer; irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer; irqfd->consumer.stop = kvm_arch_irq_bypass_stop; irqfd->consumer.start = kvm_arch_irq_bypass_start; ret = irq_bypass_register_consumer(&irqfd->consumer); if (ret) pr_info("irq bypass consumer (token %p) registration fails: %d\n", irqfd->consumer.token, ret); } #endif srcu_read_unlock(&kvm->irq_srcu, idx); /* * do not drop the file until the irqfd is fully initialized, otherwise * we might race against the EPOLLHUP */ fdput(f); return 0; fail: if (irqfd->resampler) irqfd_resampler_shutdown(irqfd); if (resamplefd && !IS_ERR(resamplefd)) eventfd_ctx_put(resamplefd); if (eventfd && !IS_ERR(eventfd)) eventfd_ctx_put(eventfd); fdput(f); out: kfree(irqfd); return ret; } bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) { struct kvm_irq_ack_notifier *kian; int gsi, idx; idx = srcu_read_lock(&kvm->irq_srcu); gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); if (gsi != -1) hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list, link, srcu_read_lock_held(&kvm->irq_srcu)) if (kian->gsi == gsi) { srcu_read_unlock(&kvm->irq_srcu, idx); return true; } srcu_read_unlock(&kvm->irq_srcu, idx); return false; } EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); void kvm_notify_acked_gsi(struct kvm *kvm, int gsi) { struct kvm_irq_ack_notifier *kian; hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list, link, srcu_read_lock_held(&kvm->irq_srcu)) if (kian->gsi == gsi) kian->irq_acked(kian); } void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) { int gsi, idx; trace_kvm_ack_irq(irqchip, pin); idx = srcu_read_lock(&kvm->irq_srcu); gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); if (gsi != -1) kvm_notify_acked_gsi(kvm, gsi); srcu_read_unlock(&kvm->irq_srcu, idx); } void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian) { mutex_lock(&kvm->irq_lock); hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); mutex_unlock(&kvm->irq_lock); kvm_arch_post_irq_ack_notifier_list_update(kvm); } void kvm_unregister_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian) { mutex_lock(&kvm->irq_lock); hlist_del_init_rcu(&kian->link); mutex_unlock(&kvm->irq_lock); synchronize_srcu(&kvm->irq_srcu); kvm_arch_post_irq_ack_notifier_list_update(kvm); } #endif void kvm_eventfd_init(struct kvm *kvm) { #ifdef CONFIG_HAVE_KVM_IRQFD spin_lock_init(&kvm->irqfds.lock); INIT_LIST_HEAD(&kvm->irqfds.items); INIT_LIST_HEAD(&kvm->irqfds.resampler_list); mutex_init(&kvm->irqfds.resampler_lock); #endif INIT_LIST_HEAD(&kvm->ioeventfds); } #ifdef CONFIG_HAVE_KVM_IRQFD /* * shutdown any irqfd's that match fd+gsi */ static int kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) { struct kvm_kernel_irqfd *irqfd, *tmp; struct eventfd_ctx *eventfd; eventfd = eventfd_ctx_fdget(args->fd); if (IS_ERR(eventfd)) return PTR_ERR(eventfd); spin_lock_irq(&kvm->irqfds.lock); list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) { /* * This clearing of irq_entry.type is needed for when * another thread calls kvm_irq_routing_update before * we flush workqueue below (we synchronize with * kvm_irq_routing_update using irqfds.lock). */ write_seqcount_begin(&irqfd->irq_entry_sc); irqfd->irq_entry.type = 0; write_seqcount_end(&irqfd->irq_entry_sc); irqfd_deactivate(irqfd); } } spin_unlock_irq(&kvm->irqfds.lock); eventfd_ctx_put(eventfd); /* * Block until we know all outstanding shutdown jobs have completed * so that we guarantee there will not be any more interrupts on this * gsi once this deassign function returns. */ flush_workqueue(irqfd_cleanup_wq); return 0; } int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) { if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE)) return -EINVAL; if (args->flags & KVM_IRQFD_FLAG_DEASSIGN) return kvm_irqfd_deassign(kvm, args); return kvm_irqfd_assign(kvm, args); } /* * This function is called as the kvm VM fd is being released. Shutdown all * irqfds that still remain open */ void kvm_irqfd_release(struct kvm *kvm) { struct kvm_kernel_irqfd *irqfd, *tmp; spin_lock_irq(&kvm->irqfds.lock); list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) irqfd_deactivate(irqfd); spin_unlock_irq(&kvm->irqfds.lock); /* * Block until we know all outstanding shutdown jobs have completed * since we do not take a kvm* reference. */ flush_workqueue(irqfd_cleanup_wq); } /* * Take note of a change in irq routing. * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards. */ void kvm_irq_routing_update(struct kvm *kvm) { struct kvm_kernel_irqfd *irqfd; spin_lock_irq(&kvm->irqfds.lock); list_for_each_entry(irqfd, &kvm->irqfds.items, list) { #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS /* Under irqfds.lock, so can read irq_entry safely */ struct kvm_kernel_irq_routing_entry old = irqfd->irq_entry; #endif irqfd_update(kvm, irqfd); #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS if (irqfd->producer && kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) { int ret = kvm_arch_update_irqfd_routing( irqfd->kvm, irqfd->producer->irq, irqfd->gsi, 1); WARN_ON(ret); } #endif } spin_unlock_irq(&kvm->irqfds.lock); } bool kvm_notify_irqfd_resampler(struct kvm *kvm, unsigned int irqchip, unsigned int pin) { struct kvm_kernel_irqfd_resampler *resampler; int gsi, idx; idx = srcu_read_lock(&kvm->irq_srcu); gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin); if (gsi != -1) { list_for_each_entry_srcu(resampler, &kvm->irqfds.resampler_list, link, srcu_read_lock_held(&kvm->irq_srcu)) { if (resampler->notifier.gsi == gsi) { irqfd_resampler_notify(resampler); srcu_read_unlock(&kvm->irq_srcu, idx); return true; } } } srcu_read_unlock(&kvm->irq_srcu, idx); return false; } /* * create a host-wide workqueue for issuing deferred shutdown requests * aggregated from all vm* instances. We need our own isolated * queue to ease flushing work items when a VM exits. */ int kvm_irqfd_init(void) { irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", 0, 0); if (!irqfd_cleanup_wq) return -ENOMEM; return 0; } void kvm_irqfd_exit(void) { destroy_workqueue(irqfd_cleanup_wq); } #endif /* * -------------------------------------------------------------------- * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal. * * userspace can register a PIO/MMIO address with an eventfd for receiving * notification when the memory has been touched. * -------------------------------------------------------------------- */ struct _ioeventfd { struct list_head list; u64 addr; int length; struct eventfd_ctx *eventfd; u64 datamatch; struct kvm_io_device dev; u8 bus_idx; bool wildcard; }; static inline struct _ioeventfd * to_ioeventfd(struct kvm_io_device *dev) { return container_of(dev, struct _ioeventfd, dev); } static void ioeventfd_release(struct _ioeventfd *p) { eventfd_ctx_put(p->eventfd); list_del(&p->list); kfree(p); } static bool ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val) { u64 _val; if (addr != p->addr) /* address must be precise for a hit */ return false; if (!p->length) /* length = 0 means only look at the address, so always a hit */ return true; if (len != p->length) /* address-range must be precise for a hit */ return false; if (p->wildcard) /* all else equal, wildcard is always a hit */ return true; /* otherwise, we have to actually compare the data */ BUG_ON(!IS_ALIGNED((unsigned long)val, len)); switch (len) { case 1: _val = *(u8 *)val; break; case 2: _val = *(u16 *)val; break; case 4: _val = *(u32 *)val; break; case 8: _val = *(u64 *)val; break; default: return false; } return _val == p->datamatch; } /* MMIO/PIO writes trigger an event if the addr/val match */ static int ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, int len, const void *val) { struct _ioeventfd *p = to_ioeventfd(this); if (!ioeventfd_in_range(p, addr, len, val)) return -EOPNOTSUPP; eventfd_signal(p->eventfd, 1); return 0; } /* * This function is called as KVM is completely shutting down. We do not * need to worry about locking just nuke anything we have as quickly as possible */ static void ioeventfd_destructor(struct kvm_io_device *this) { struct _ioeventfd *p = to_ioeventfd(this); ioeventfd_release(p); } static const struct kvm_io_device_ops ioeventfd_ops = { .write = ioeventfd_write, .destructor = ioeventfd_destructor, }; /* assumes kvm->slots_lock held */ static bool ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p) { struct _ioeventfd *_p; list_for_each_entry(_p, &kvm->ioeventfds, list) if (_p->bus_idx == p->bus_idx && _p->addr == p->addr && (!_p->length || !p->length || (_p->length == p->length && (_p->wildcard || p->wildcard || _p->datamatch == p->datamatch)))) return true; return false; } static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags) { if (flags & KVM_IOEVENTFD_FLAG_PIO) return KVM_PIO_BUS; if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY) return KVM_VIRTIO_CCW_NOTIFY_BUS; return KVM_MMIO_BUS; } static int kvm_assign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_ioeventfd *args) { struct eventfd_ctx *eventfd; struct _ioeventfd *p; int ret; eventfd = eventfd_ctx_fdget(args->fd); if (IS_ERR(eventfd)) return PTR_ERR(eventfd); p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT); if (!p) { ret = -ENOMEM; goto fail; } INIT_LIST_HEAD(&p->list); p->addr = args->addr; p->bus_idx = bus_idx; p->length = args->len; p->eventfd = eventfd; /* The datamatch feature is optional, otherwise this is a wildcard */ if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH) p->datamatch = args->datamatch; else p->wildcard = true; mutex_lock(&kvm->slots_lock); /* Verify that there isn't a match already */ if (ioeventfd_check_collision(kvm, p)) { ret = -EEXIST; goto unlock_fail; } kvm_iodevice_init(&p->dev, &ioeventfd_ops); ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length, &p->dev); if (ret < 0) goto unlock_fail; kvm_get_bus(kvm, bus_idx)->ioeventfd_count++; list_add_tail(&p->list, &kvm->ioeventfds); mutex_unlock(&kvm->slots_lock); return 0; unlock_fail: mutex_unlock(&kvm->slots_lock); kfree(p); fail: eventfd_ctx_put(eventfd); return ret; } static int kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_ioeventfd *args) { struct _ioeventfd *p; struct eventfd_ctx *eventfd; struct kvm_io_bus *bus; int ret = -ENOENT; bool wildcard; eventfd = eventfd_ctx_fdget(args->fd); if (IS_ERR(eventfd)) return PTR_ERR(eventfd); wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); mutex_lock(&kvm->slots_lock); list_for_each_entry(p, &kvm->ioeventfds, list) { if (p->bus_idx != bus_idx || p->eventfd != eventfd || p->addr != args->addr || p->length != args->len || p->wildcard != wildcard) continue; if (!p->wildcard && p->datamatch != args->datamatch) continue; kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); bus = kvm_get_bus(kvm, bus_idx); if (bus) bus->ioeventfd_count--; ret = 0; break; } mutex_unlock(&kvm->slots_lock); eventfd_ctx_put(eventfd); return ret; } static int kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) { enum kvm_bus bus_idx = ioeventfd_bus_from_flags(args->flags); int ret = kvm_deassign_ioeventfd_idx(kvm, bus_idx, args); if (!args->len && bus_idx == KVM_MMIO_BUS) kvm_deassign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args); return ret; } static int kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) { enum kvm_bus bus_idx; int ret; bus_idx = ioeventfd_bus_from_flags(args->flags); /* must be natural-word sized, or 0 to ignore length */ switch (args->len) { case 0: case 1: case 2: case 4: case 8: break; default: return -EINVAL; } /* check for range overflow */ if (args->addr + args->len < args->addr) return -EINVAL; /* check for extra flags that we don't understand */ if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK) return -EINVAL; /* ioeventfd with no length can't be combined with DATAMATCH */ if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)) return -EINVAL; ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args); if (ret) goto fail; /* When length is ignored, MMIO is also put on a separate bus, for * faster lookups. */ if (!args->len && bus_idx == KVM_MMIO_BUS) { ret = kvm_assign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args); if (ret < 0) goto fast_fail; } return 0; fast_fail: kvm_deassign_ioeventfd_idx(kvm, bus_idx, args); fail: return ret; } int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) { if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN) return kvm_deassign_ioeventfd(kvm, args); return kvm_assign_ioeventfd(kvm, args); }
1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 // SPDX-License-Identifier: GPL-2.0-only /* * stackglue.c * * Code which implements an OCFS2 specific interface to underlying * cluster stacks. * * Copyright (C) 2007, 2009 Oracle. All rights reserved. */ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/fs.h> #include <linux/kobject.h> #include <linux/sysfs.h> #include <linux/sysctl.h> #include "ocfs2_fs.h" #include "stackglue.h" #define OCFS2_STACK_PLUGIN_O2CB "o2cb" #define OCFS2_STACK_PLUGIN_USER "user" #define OCFS2_MAX_HB_CTL_PATH 256 static struct ocfs2_protocol_version locking_max_version; static DEFINE_SPINLOCK(ocfs2_stack_lock); static LIST_HEAD(ocfs2_stack_list); static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; /* * The stack currently in use. If not null, active_stack->sp_count > 0, * the module is pinned, and the locking protocol cannot be changed. */ static struct ocfs2_stack_plugin *active_stack; static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) { struct ocfs2_stack_plugin *p; assert_spin_locked(&ocfs2_stack_lock); list_for_each_entry(p, &ocfs2_stack_list, sp_list) { if (!strcmp(p->sp_name, name)) return p; } return NULL; } static int ocfs2_stack_driver_request(const char *stack_name, const char *plugin_name) { int rc; struct ocfs2_stack_plugin *p; spin_lock(&ocfs2_stack_lock); /* * If the stack passed by the filesystem isn't the selected one, * we can't continue. */ if (strcmp(stack_name, cluster_stack_name)) { rc = -EBUSY; goto out; } if (active_stack) { /* * If the active stack isn't the one we want, it cannot * be selected right now. */ if (!strcmp(active_stack->sp_name, plugin_name)) rc = 0; else rc = -EBUSY; goto out; } p = ocfs2_stack_lookup(plugin_name); if (!p || !try_module_get(p->sp_owner)) { rc = -ENOENT; goto out; } active_stack = p; rc = 0; out: /* If we found it, pin it */ if (!rc) active_stack->sp_count++; spin_unlock(&ocfs2_stack_lock); return rc; } /* * This function looks up the appropriate stack and makes it active. If * there is no stack, it tries to load it. It will fail if the stack still * cannot be found. It will also fail if a different stack is in use. */ static int ocfs2_stack_driver_get(const char *stack_name) { int rc; char *plugin_name = OCFS2_STACK_PLUGIN_O2CB; /* * Classic stack does not pass in a stack name. This is * compatible with older tools as well. */ if (!stack_name || !*stack_name) stack_name = OCFS2_STACK_PLUGIN_O2CB; if (strlen(stack_name) != OCFS2_STACK_LABEL_LEN) { printk(KERN_ERR "ocfs2 passed an invalid cluster stack label: \"%s\"\n", stack_name); return -EINVAL; } /* Anything that isn't the classic stack is a user stack */ if (strcmp(stack_name, OCFS2_STACK_PLUGIN_O2CB)) plugin_name = OCFS2_STACK_PLUGIN_USER; rc = ocfs2_stack_driver_request(stack_name, plugin_name); if (rc == -ENOENT) { request_module("ocfs2_stack_%s", plugin_name); rc = ocfs2_stack_driver_request(stack_name, plugin_name); } if (rc == -ENOENT) { printk(KERN_ERR "ocfs2: Cluster stack driver \"%s\" cannot be found\n", plugin_name); } else if (rc == -EBUSY) { printk(KERN_ERR "ocfs2: A different cluster stack is in use\n"); } return rc; } static void ocfs2_stack_driver_put(void) { spin_lock(&ocfs2_stack_lock); BUG_ON(active_stack == NULL); BUG_ON(active_stack->sp_count == 0); active_stack->sp_count--; if (!active_stack->sp_count) { module_put(active_stack->sp_owner); active_stack = NULL; } spin_unlock(&ocfs2_stack_lock); } int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin) { int rc; spin_lock(&ocfs2_stack_lock); if (!ocfs2_stack_lookup(plugin->sp_name)) { plugin->sp_count = 0; plugin->sp_max_proto = locking_max_version; list_add(&plugin->sp_list, &ocfs2_stack_list); printk(KERN_INFO "ocfs2: Registered cluster interface %s\n", plugin->sp_name); rc = 0; } else { printk(KERN_ERR "ocfs2: Stack \"%s\" already registered\n", plugin->sp_name); rc = -EEXIST; } spin_unlock(&ocfs2_stack_lock); return rc; } EXPORT_SYMBOL_GPL(ocfs2_stack_glue_register); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin) { struct ocfs2_stack_plugin *p; spin_lock(&ocfs2_stack_lock); p = ocfs2_stack_lookup(plugin->sp_name); if (p) { BUG_ON(p != plugin); BUG_ON(plugin == active_stack); BUG_ON(plugin->sp_count != 0); list_del_init(&plugin->sp_list); printk(KERN_INFO "ocfs2: Unregistered cluster interface %s\n", plugin->sp_name); } else { printk(KERN_ERR "Stack \"%s\" is not registered\n", plugin->sp_name); } spin_unlock(&ocfs2_stack_lock); } EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister); void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_proto) { struct ocfs2_stack_plugin *p; spin_lock(&ocfs2_stack_lock); if (memcmp(max_proto, &locking_max_version, sizeof(struct ocfs2_protocol_version))) { BUG_ON(locking_max_version.pv_major != 0); locking_max_version = *max_proto; list_for_each_entry(p, &ocfs2_stack_list, sp_list) { p->sp_max_proto = locking_max_version; } } spin_unlock(&ocfs2_stack_lock); } EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_max_proto_version); /* * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take no argument * for the ast and bast functions. They will pass the lksb to the ast * and bast. The caller can wrap the lksb with their own structure to * get more information. */ int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, int mode, struct ocfs2_dlm_lksb *lksb, u32 flags, void *name, unsigned int namelen) { if (!lksb->lksb_conn) lksb->lksb_conn = conn; else BUG_ON(lksb->lksb_conn != conn); return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags, name, namelen); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lock); int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, struct ocfs2_dlm_lksb *lksb, u32 flags) { BUG_ON(lksb->lksb_conn == NULL); return active_stack->sp_ops->dlm_unlock(conn, lksb, flags); } EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock); int ocfs2_dlm_lock_status(struct ocfs2_dlm_lksb *lksb) { return active_stack->sp_ops->lock_status(lksb); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status); int ocfs2_dlm_lvb_valid(struct ocfs2_dlm_lksb *lksb) { return active_stack->sp_ops->lvb_valid(lksb); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb_valid); void *ocfs2_dlm_lvb(struct ocfs2_dlm_lksb *lksb) { return active_stack->sp_ops->lock_lvb(lksb); } EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb); void ocfs2_dlm_dump_lksb(struct ocfs2_dlm_lksb *lksb) { active_stack->sp_ops->dump_lksb(lksb); } EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb); int ocfs2_stack_supports_plocks(void) { return active_stack && active_stack->sp_ops->plock; } EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks); /* * ocfs2_plock() can only be safely called if * ocfs2_stack_supports_plocks() returned true */ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, struct file *file, int cmd, struct file_lock *fl) { WARN_ON_ONCE(active_stack->sp_ops->plock == NULL); if (active_stack->sp_ops->plock) return active_stack->sp_ops->plock(conn, ino, file, cmd, fl); return -EOPNOTSUPP; } EXPORT_SYMBOL_GPL(ocfs2_plock); int ocfs2_cluster_connect(const char *stack_name, const char *cluster_name, int cluster_name_len, const char *group, int grouplen, struct ocfs2_locking_protocol *lproto, void (*recovery_handler)(int node_num, void *recovery_data), void *recovery_data, struct ocfs2_cluster_connection **conn) { int rc = 0; struct ocfs2_cluster_connection *new_conn; BUG_ON(group == NULL); BUG_ON(conn == NULL); BUG_ON(recovery_handler == NULL); if (grouplen > GROUP_NAME_MAX) { rc = -EINVAL; goto out; } if (memcmp(&lproto->lp_max_version, &locking_max_version, sizeof(struct ocfs2_protocol_version))) { rc = -EINVAL; goto out; } new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection), GFP_KERNEL); if (!new_conn) { rc = -ENOMEM; goto out; } strscpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1); new_conn->cc_namelen = grouplen; if (cluster_name_len) strscpy(new_conn->cc_cluster_name, cluster_name, CLUSTER_NAME_MAX + 1); new_conn->cc_cluster_name_len = cluster_name_len; new_conn->cc_recovery_handler = recovery_handler; new_conn->cc_recovery_data = recovery_data; new_conn->cc_proto = lproto; /* Start the new connection at our maximum compatibility level */ new_conn->cc_version = lproto->lp_max_version; /* This will pin the stack driver if successful */ rc = ocfs2_stack_driver_get(stack_name); if (rc) goto out_free; rc = active_stack->sp_ops->connect(new_conn); if (rc) { ocfs2_stack_driver_put(); goto out_free; } *conn = new_conn; out_free: if (rc) kfree(new_conn); out: return rc; } EXPORT_SYMBOL_GPL(ocfs2_cluster_connect); /* The caller will ensure all nodes have the same cluster stack */ int ocfs2_cluster_connect_agnostic(const char *group, int grouplen, struct ocfs2_locking_protocol *lproto, void (*recovery_handler)(int node_num, void *recovery_data), void *recovery_data, struct ocfs2_cluster_connection **conn) { char *stack_name = NULL; if (cluster_stack_name[0]) stack_name = cluster_stack_name; return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen, lproto, recovery_handler, recovery_data, conn); } EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic); /* If hangup_pending is 0, the stack driver will be dropped */ int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, int hangup_pending) { int ret; BUG_ON(conn == NULL); ret = active_stack->sp_ops->disconnect(conn); /* XXX Should we free it anyway? */ if (!ret) { kfree(conn); if (!hangup_pending) ocfs2_stack_driver_put(); } return ret; } EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect); /* * Leave the group for this filesystem. This is executed by a userspace * program (stored in ocfs2_hb_ctl_path). */ static void ocfs2_leave_group(const char *group) { int ret; char *argv[5], *envp[3]; argv[0] = ocfs2_hb_ctl_path; argv[1] = "-K"; argv[2] = "-u"; argv[3] = (char *)group; argv[4] = NULL; /* minimal command environment taken from cpu_run_sbin_hotplug */ envp[0] = "HOME=/"; envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[2] = NULL; ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (ret < 0) { printk(KERN_ERR "ocfs2: Error %d running user helper " "\"%s %s %s %s\"\n", ret, argv[0], argv[1], argv[2], argv[3]); } } /* * Hangup is a required post-umount. ocfs2-tools software expects the * filesystem to call "ocfs2_hb_ctl" during unmount. This happens * regardless of whether the DLM got started, so we can't do it * in ocfs2_cluster_disconnect(). The ocfs2_leave_group() function does * the actual work. */ void ocfs2_cluster_hangup(const char *group, int grouplen) { BUG_ON(group == NULL); BUG_ON(group[grouplen] != '\0'); ocfs2_leave_group(group); /* cluster_disconnect() was called with hangup_pending==1 */ ocfs2_stack_driver_put(); } EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn, unsigned int *node) { return active_stack->sp_ops->this_node(conn, node); } EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); /* * Sysfs bits */ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t ret = 0; spin_lock(&ocfs2_stack_lock); if (locking_max_version.pv_major) ret = snprintf(buf, PAGE_SIZE, "%u.%u\n", locking_max_version.pv_major, locking_max_version.pv_minor); spin_unlock(&ocfs2_stack_lock); return ret; } static struct kobj_attribute ocfs2_attr_max_locking_protocol = __ATTR(max_locking_protocol, S_IRUGO, ocfs2_max_locking_protocol_show, NULL); static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t ret = 0, total = 0, remain = PAGE_SIZE; struct ocfs2_stack_plugin *p; spin_lock(&ocfs2_stack_lock); list_for_each_entry(p, &ocfs2_stack_list, sp_list) { ret = snprintf(buf, remain, "%s\n", p->sp_name); if (ret >= remain) { /* snprintf() didn't fit */ total = -E2BIG; break; } total += ret; remain -= ret; } spin_unlock(&ocfs2_stack_lock); return total; } static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins = __ATTR(loaded_cluster_plugins, S_IRUGO, ocfs2_loaded_cluster_plugins_show, NULL); static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t ret = 0; spin_lock(&ocfs2_stack_lock); if (active_stack) { ret = snprintf(buf, PAGE_SIZE, "%s\n", active_stack->sp_name); if (ret >= PAGE_SIZE) ret = -E2BIG; } spin_unlock(&ocfs2_stack_lock); return ret; } static struct kobj_attribute ocfs2_attr_active_cluster_plugin = __ATTR(active_cluster_plugin, S_IRUGO, ocfs2_active_cluster_plugin_show, NULL); static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t ret; spin_lock(&ocfs2_stack_lock); ret = snprintf(buf, PAGE_SIZE, "%s\n", cluster_stack_name); spin_unlock(&ocfs2_stack_lock); return ret; } static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { size_t len = count; ssize_t ret; if (len == 0) return len; if (buf[len - 1] == '\n') len--; if ((len != OCFS2_STACK_LABEL_LEN) || (strnlen(buf, len) != len)) return -EINVAL; spin_lock(&ocfs2_stack_lock); if (active_stack) { if (!strncmp(buf, cluster_stack_name, len)) ret = count; else ret = -EBUSY; } else { memcpy(cluster_stack_name, buf, len); ret = count; } spin_unlock(&ocfs2_stack_lock); return ret; } static struct kobj_attribute ocfs2_attr_cluster_stack = __ATTR(cluster_stack, S_IRUGO | S_IWUSR, ocfs2_cluster_stack_show, ocfs2_cluster_stack_store); static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE, "1\n"); } static struct kobj_attribute ocfs2_attr_dlm_recover_support = __ATTR(dlm_recover_callback_support, S_IRUGO, ocfs2_dlm_recover_show, NULL); static struct attribute *ocfs2_attrs[] = { &ocfs2_attr_max_locking_protocol.attr, &ocfs2_attr_loaded_cluster_plugins.attr, &ocfs2_attr_active_cluster_plugin.attr, &ocfs2_attr_cluster_stack.attr, &ocfs2_attr_dlm_recover_support.attr, NULL, }; static const struct attribute_group ocfs2_attr_group = { .attrs = ocfs2_attrs, }; struct kset *ocfs2_kset; EXPORT_SYMBOL_GPL(ocfs2_kset); static void ocfs2_sysfs_exit(void) { kset_unregister(ocfs2_kset); } static int ocfs2_sysfs_init(void) { int ret; ocfs2_kset = kset_create_and_add("ocfs2", NULL, fs_kobj); if (!ocfs2_kset) return -ENOMEM; ret = sysfs_create_group(&ocfs2_kset->kobj, &ocfs2_attr_group); if (ret) goto error; return 0; error: kset_unregister(ocfs2_kset); return ret; } /* * Sysctl bits * * The sysctl lives at /proc/sys/fs/ocfs2/nm/hb_ctl_path. The 'nm' doesn't * make as much sense in a multiple cluster stack world, but it's safer * and easier to preserve the name. */ static struct ctl_table ocfs2_nm_table[] = { { .procname = "hb_ctl_path", .data = ocfs2_hb_ctl_path, .maxlen = OCFS2_MAX_HB_CTL_PATH, .mode = 0644, .proc_handler = proc_dostring, }, { } }; static struct ctl_table_header *ocfs2_table_header; /* * Initialization */ static int __init ocfs2_stack_glue_init(void) { int ret; strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); ocfs2_table_header = register_sysctl("fs/ocfs2/nm", ocfs2_nm_table); if (!ocfs2_table_header) { printk(KERN_ERR "ocfs2 stack glue: unable to register sysctl\n"); return -ENOMEM; /* or something. */ } ret = ocfs2_sysfs_init(); if (ret) unregister_sysctl_table(ocfs2_table_header); return ret; } static void __exit ocfs2_stack_glue_exit(void) { memset(&locking_max_version, 0, sizeof(struct ocfs2_protocol_version)); ocfs2_sysfs_exit(); if (ocfs2_table_header) unregister_sysctl_table(ocfs2_table_header); } MODULE_AUTHOR("Oracle"); MODULE_DESCRIPTION("ocfs2 cluster stack glue layer"); MODULE_LICENSE("GPL"); module_init(ocfs2_stack_glue_init); module_exit(ocfs2_stack_glue_exit);
538 541 534 529 11208 11202 9126 9125 1276 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 // SPDX-License-Identifier: GPL-2.0-or-later #define pr_fmt(fmt) "ref_tracker: " fmt #include <linux/export.h> #include <linux/list_sort.h> #include <linux/ref_tracker.h> #include <linux/slab.h> #include <linux/stacktrace.h> #include <linux/stackdepot.h> #define REF_TRACKER_STACK_ENTRIES 16 #define STACK_BUF_SIZE 1024 struct ref_tracker { struct list_head head; /* anchor into dir->list or dir->quarantine */ bool dead; depot_stack_handle_t alloc_stack_handle; depot_stack_handle_t free_stack_handle; }; struct ref_tracker_dir_stats { int total; int count; struct { depot_stack_handle_t stack_handle; unsigned int count; } stacks[]; }; static struct ref_tracker_dir_stats * ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit) { struct ref_tracker_dir_stats *stats; struct ref_tracker *tracker; stats = kmalloc(struct_size(stats, stacks, limit), GFP_NOWAIT | __GFP_NOWARN); if (!stats) return ERR_PTR(-ENOMEM); stats->total = 0; stats->count = 0; list_for_each_entry(tracker, &dir->list, head) { depot_stack_handle_t stack = tracker->alloc_stack_handle; int i; ++stats->total; for (i = 0; i < stats->count; ++i) if (stats->stacks[i].stack_handle == stack) break; if (i >= limit) continue; if (i >= stats->count) { stats->stacks[i].stack_handle = stack; stats->stacks[i].count = 0; ++stats->count; } ++stats->stacks[i].count; } return stats; } struct ostream { char *buf; int size, used; }; #define pr_ostream(stream, fmt, args...) \ ({ \ struct ostream *_s = (stream); \ \ if (!_s->buf) { \ pr_err(fmt, ##args); \ } else { \ int ret, len = _s->size - _s->used; \ ret = snprintf(_s->buf + _s->used, len, pr_fmt(fmt), ##args); \ _s->used += min(ret, len); \ } \ }) static void __ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir, unsigned int display_limit, struct ostream *s) { struct ref_tracker_dir_stats *stats; unsigned int i = 0, skipped; depot_stack_handle_t stack; char *sbuf; lockdep_assert_held(&dir->lock); if (list_empty(&dir->list)) return; stats = ref_tracker_get_stats(dir, display_limit); if (IS_ERR(stats)) { pr_ostream(s, "%s@%pK: couldn't get stats, error %pe\n", dir->name, dir, stats); return; } sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT | __GFP_NOWARN); for (i = 0, skipped = stats->total; i < stats->count; ++i) { stack = stats->stacks[i].stack_handle; if (sbuf && !stack_depot_snprint(stack, sbuf, STACK_BUF_SIZE, 4)) sbuf[0] = 0; pr_ostream(s, "%s@%pK has %d/%d users at\n%s\n", dir->name, dir, stats->stacks[i].count, stats->total, sbuf); skipped -= stats->stacks[i].count; } if (skipped) pr_ostream(s, "%s@%pK skipped reports about %d/%d users.\n", dir->name, dir, skipped, stats->total); kfree(sbuf); kfree(stats); } void ref_tracker_dir_print_locked(struct ref_tracker_dir *dir, unsigned int display_limit) { struct ostream os = {}; __ref_tracker_dir_pr_ostream(dir, display_limit, &os); } EXPORT_SYMBOL(ref_tracker_dir_print_locked); void ref_tracker_dir_print(struct ref_tracker_dir *dir, unsigned int display_limit) { unsigned long flags; spin_lock_irqsave(&dir->lock, flags); ref_tracker_dir_print_locked(dir, display_limit); spin_unlock_irqrestore(&dir->lock, flags); } EXPORT_SYMBOL(ref_tracker_dir_print); int ref_tracker_dir_snprint(struct ref_tracker_dir *dir, char *buf, size_t size) { struct ostream os = { .buf = buf, .size = size }; unsigned long flags; spin_lock_irqsave(&dir->lock, flags); __ref_tracker_dir_pr_ostream(dir, 16, &os); spin_unlock_irqrestore(&dir->lock, flags); return os.used; } EXPORT_SYMBOL(ref_tracker_dir_snprint); void ref_tracker_dir_exit(struct ref_tracker_dir *dir) { struct ref_tracker *tracker, *n; unsigned long flags; bool leak = false; dir->dead = true; spin_lock_irqsave(&dir->lock, flags); list_for_each_entry_safe(tracker, n, &dir->quarantine, head) { list_del(&tracker->head); kfree(tracker); dir->quarantine_avail++; } if (!list_empty(&dir->list)) { ref_tracker_dir_print_locked(dir, 16); leak = true; list_for_each_entry_safe(tracker, n, &dir->list, head) { list_del(&tracker->head); kfree(tracker); } } spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(leak); WARN_ON_ONCE(refcount_read(&dir->untracked) != 1); WARN_ON_ONCE(refcount_read(&dir->no_tracker) != 1); } EXPORT_SYMBOL(ref_tracker_dir_exit); int ref_tracker_alloc(struct ref_tracker_dir *dir, struct ref_tracker **trackerp, gfp_t gfp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; struct ref_tracker *tracker; unsigned int nr_entries; gfp_t gfp_mask = gfp | __GFP_NOWARN; unsigned long flags; WARN_ON_ONCE(dir->dead); if (!trackerp) { refcount_inc(&dir->no_tracker); return 0; } if (gfp & __GFP_DIRECT_RECLAIM) gfp_mask |= __GFP_NOFAIL; *trackerp = tracker = kzalloc(sizeof(*tracker), gfp_mask); if (unlikely(!tracker)) { pr_err_once("memory allocation failure, unreliable refcount tracker.\n"); refcount_inc(&dir->untracked); return -ENOMEM; } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); tracker->alloc_stack_handle = stack_depot_save(entries, nr_entries, gfp); spin_lock_irqsave(&dir->lock, flags); list_add(&tracker->head, &dir->list); spin_unlock_irqrestore(&dir->lock, flags); return 0; } EXPORT_SYMBOL_GPL(ref_tracker_alloc); int ref_tracker_free(struct ref_tracker_dir *dir, struct ref_tracker **trackerp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; depot_stack_handle_t stack_handle; struct ref_tracker *tracker; unsigned int nr_entries; unsigned long flags; WARN_ON_ONCE(dir->dead); if (!trackerp) { refcount_dec(&dir->no_tracker); return 0; } tracker = *trackerp; if (!tracker) { refcount_dec(&dir->untracked); return -EEXIST; } nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); stack_handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT | __GFP_NOWARN); spin_lock_irqsave(&dir->lock, flags); if (tracker->dead) { pr_err("reference already released.\n"); if (tracker->alloc_stack_handle) { pr_err("allocated in:\n"); stack_depot_print(tracker->alloc_stack_handle); } if (tracker->free_stack_handle) { pr_err("freed in:\n"); stack_depot_print(tracker->free_stack_handle); } spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(1); return -EINVAL; } tracker->dead = true; tracker->free_stack_handle = stack_handle; list_move_tail(&tracker->head, &dir->quarantine); if (!dir->quarantine_avail) { tracker = list_first_entry(&dir->quarantine, struct ref_tracker, head); list_del(&tracker->head); } else { dir->quarantine_avail--; tracker = NULL; } spin_unlock_irqrestore(&dir->lock, flags); kfree(tracker); return 0; } EXPORT_SYMBOL_GPL(ref_tracker_free);
9 140 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 // SPDX-License-Identifier: GPL-2.0-only /* * Unified UUID/GUID definition * * Copyright (C) 2009, 2016 Intel Corp. * Huang Ying <ying.huang@intel.com> */ #include <linux/kernel.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/uuid.h> #include <linux/random.h> const guid_t guid_null; EXPORT_SYMBOL(guid_null); const uuid_t uuid_null; EXPORT_SYMBOL(uuid_null); const u8 guid_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; const u8 uuid_index[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; /** * generate_random_uuid - generate a random UUID * @uuid: where to put the generated UUID * * Random UUID interface * * Used to create a Boot ID or a filesystem UUID/GUID, but can be * useful for other kernel drivers. */ void generate_random_uuid(unsigned char uuid[16]) { get_random_bytes(uuid, 16); /* Set UUID version to 4 --- truly random generation */ uuid[6] = (uuid[6] & 0x0F) | 0x40; /* Set the UUID variant to DCE */ uuid[8] = (uuid[8] & 0x3F) | 0x80; } EXPORT_SYMBOL(generate_random_uuid); void generate_random_guid(unsigned char guid[16]) { get_random_bytes(guid, 16); /* Set GUID version to 4 --- truly random generation */ guid[7] = (guid[7] & 0x0F) | 0x40; /* Set the GUID variant to DCE */ guid[8] = (guid[8] & 0x3F) | 0x80; } EXPORT_SYMBOL(generate_random_guid); static void __uuid_gen_common(__u8 b[16]) { get_random_bytes(b, 16); /* reversion 0b10 */ b[8] = (b[8] & 0x3F) | 0x80; } void guid_gen(guid_t *lu) { __uuid_gen_common(lu->b); /* version 4 : random generation */ lu->b[7] = (lu->b[7] & 0x0F) | 0x40; } EXPORT_SYMBOL_GPL(guid_gen); void uuid_gen(uuid_t *bu) { __uuid_gen_common(bu->b); /* version 4 : random generation */ bu->b[6] = (bu->b[6] & 0x0F) | 0x40; } EXPORT_SYMBOL_GPL(uuid_gen); /** * uuid_is_valid - checks if a UUID string is valid * @uuid: UUID string to check * * Description: * It checks if the UUID string is following the format: * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx * * where x is a hex digit. * * Return: true if input is valid UUID string. */ bool uuid_is_valid(const char *uuid) { unsigned int i; for (i = 0; i < UUID_STRING_LEN; i++) { if (i == 8 || i == 13 || i == 18 || i == 23) { if (uuid[i] != '-') return false; } else if (!isxdigit(uuid[i])) { return false; } } return true; } EXPORT_SYMBOL(uuid_is_valid); static int __uuid_parse(const char *uuid, __u8 b[16], const u8 ei[16]) { static const u8 si[16] = {0,2,4,6,9,11,14,16,19,21,24,26,28,30,32,34}; unsigned int i; if (!uuid_is_valid(uuid)) return -EINVAL; for (i = 0; i < 16; i++) { int hi = hex_to_bin(uuid[si[i] + 0]); int lo = hex_to_bin(uuid[si[i] + 1]); b[ei[i]] = (hi << 4) | lo; } return 0; } int guid_parse(const char *uuid, guid_t *u) { return __uuid_parse(uuid, u->b, guid_index); } EXPORT_SYMBOL(guid_parse); int uuid_parse(const char *uuid, uuid_t *u) { return __uuid_parse(uuid, u->b, uuid_index); } EXPORT_SYMBOL(uuid_parse);
4 4 4 1 3 3 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/cls_cgroup.c Control Group Classifier * * Authors: Thomas Graf <tgraf@suug.ch> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/rcupdate.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/sock.h> #include <net/cls_cgroup.h> #include <net/tc_wrapper.h> struct cls_cgroup_head { u32 handle; struct tcf_exts exts; struct tcf_ematch_tree ematches; struct tcf_proto *tp; struct rcu_work rwork; }; TC_INDIRECT_SCOPE int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_cgroup_head *head = rcu_dereference_bh(tp->root); u32 classid = task_get_classid(skb); if (unlikely(!head)) return -1; if (!classid) return -1; if (!tcf_em_tree_match(skb, &head->ematches, NULL)) return -1; res->classid = classid; res->class = 0; return tcf_exts_exec(skb, &head->exts, res); } static void *cls_cgroup_get(struct tcf_proto *tp, u32 handle) { return NULL; } static int cls_cgroup_init(struct tcf_proto *tp) { return 0; } static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = { [TCA_CGROUP_EMATCHES] = { .type = NLA_NESTED }, }; static void __cls_cgroup_destroy(struct cls_cgroup_head *head) { tcf_exts_destroy(&head->exts); tcf_em_tree_destroy(&head->ematches); tcf_exts_put_net(&head->exts); kfree(head); } static void cls_cgroup_destroy_work(struct work_struct *work) { struct cls_cgroup_head *head = container_of(to_rcu_work(work), struct cls_cgroup_head, rwork); rtnl_lock(); __cls_cgroup_destroy(head); rtnl_unlock(); } static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, u32 flags, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_CGROUP_MAX + 1]; struct cls_cgroup_head *head = rtnl_dereference(tp->root); struct cls_cgroup_head *new; int err; if (!tca[TCA_OPTIONS]) return -EINVAL; if (!head && !handle) return -EINVAL; if (head && handle != head->handle) return -ENOENT; new = kzalloc(sizeof(*head), GFP_KERNEL); if (!new) return -ENOBUFS; err = tcf_exts_init(&new->exts, net, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); if (err < 0) goto errout; new->handle = handle; new->tp = tp; err = nla_parse_nested_deprecated(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS], cgroup_policy, NULL); if (err < 0) goto errout; err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &new->exts, flags, extack); if (err < 0) goto errout; err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &new->ematches); if (err < 0) goto errout; rcu_assign_pointer(tp->root, new); if (head) { tcf_exts_get_net(&head->exts); tcf_queue_work(&head->rwork, cls_cgroup_destroy_work); } return 0; errout: tcf_exts_destroy(&new->exts); kfree(new); return err; } static void cls_cgroup_destroy(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { struct cls_cgroup_head *head = rtnl_dereference(tp->root); /* Head can still be NULL due to cls_cgroup_init(). */ if (head) { if (tcf_exts_get_net(&head->exts)) tcf_queue_work(&head->rwork, cls_cgroup_destroy_work); else __cls_cgroup_destroy(head); } } static int cls_cgroup_delete(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held) { struct cls_cgroup_head *head = rtnl_dereference(tp->root); if (arg->count < arg->skip) goto skip; if (!head) return; if (arg->fn(tp, head, arg) < 0) { arg->stop = 1; return; } skip: arg->count++; } static int cls_cgroup_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { struct cls_cgroup_head *head = rtnl_dereference(tp->root); struct nlattr *nest; t->tcm_handle = head->handle; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (tcf_exts_dump(skb, &head->exts) < 0 || tcf_em_tree_dump(skb, &head->ematches, TCA_CGROUP_EMATCHES) < 0) goto nla_put_failure; nla_nest_end(skb, nest); if (tcf_exts_dump_stats(skb, &head->exts) < 0) goto nla_put_failure; return skb->len; nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static struct tcf_proto_ops cls_cgroup_ops __read_mostly = { .kind = "cgroup", .init = cls_cgroup_init, .change = cls_cgroup_change, .classify = cls_cgroup_classify, .destroy = cls_cgroup_destroy, .get = cls_cgroup_get, .delete = cls_cgroup_delete, .walk = cls_cgroup_walk, .dump = cls_cgroup_dump, .owner = THIS_MODULE, }; static int __init init_cgroup_cls(void) { return register_tcf_proto_ops(&cls_cgroup_ops); } static void __exit exit_cgroup_cls(void) { unregister_tcf_proto_ops(&cls_cgroup_ops); } module_init(init_cgroup_cls); module_exit(exit_cgroup_cls); MODULE_DESCRIPTION("TC cgroup classifier"); MODULE_LICENSE("GPL");
4 25 19 20 18 21 3 4 2 34 4 18 10 1 3 31 47 93 67 7 21 3 3 24 2 35 9 24 27 52 172 138 46 27 15 53 21 5 10 8 4 2 7 10 25 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 // SPDX-License-Identifier: GPL-2.0-or-later /* scm.c - Socket level control messages processing. * * Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * Alignment and value checking mods by Craig Metz */ #include <linux/module.h> #include <linux/signal.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/sched/user.h> #include <linux/mm.h> #include <linux/kernel.h> #include <linux/stat.h> #include <linux/socket.h> #include <linux/file.h> #include <linux/fcntl.h> #include <linux/net.h> #include <linux/interrupt.h> #include <linux/netdevice.h> #include <linux/security.h> #include <linux/pid_namespace.h> #include <linux/pid.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/errqueue.h> #include <linux/io_uring.h> #include <linux/uaccess.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/compat.h> #include <net/scm.h> #include <net/cls_cgroup.h> /* * Only allow a user to send credentials, that they could set with * setu(g)id. */ static __inline__ int scm_check_creds(struct ucred *creds) { const struct cred *cred = current_cred(); kuid_t uid = make_kuid(cred->user_ns, creds->uid); kgid_t gid = make_kgid(cred->user_ns, creds->gid); if (!uid_valid(uid) || !gid_valid(gid)) return -EINVAL; if ((creds->pid == task_tgid_vnr(current) || ns_capable(task_active_pid_ns(current)->user_ns, CAP_SYS_ADMIN)) && ((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) || uid_eq(uid, cred->suid)) || ns_capable(cred->user_ns, CAP_SETUID)) && ((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) || gid_eq(gid, cred->sgid)) || ns_capable(cred->user_ns, CAP_SETGID))) { return 0; } return -EPERM; } static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) { int *fdp = (int*)CMSG_DATA(cmsg); struct scm_fp_list *fpl = *fplp; struct file **fpp; int i, num; num = (cmsg->cmsg_len - sizeof(struct cmsghdr))/sizeof(int); if (num <= 0) return 0; if (num > SCM_MAX_FD) return -EINVAL; if (!fpl) { fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_ACCOUNT); if (!fpl) return -ENOMEM; *fplp = fpl; fpl->count = 0; fpl->max = SCM_MAX_FD; fpl->user = NULL; } fpp = &fpl->fp[fpl->count]; if (fpl->count + num > fpl->max) return -EINVAL; /* * Verify the descriptors and increment the usage count. */ for (i=0; i< num; i++) { int fd = fdp[i]; struct file *file; if (fd < 0 || !(file = fget_raw(fd))) return -EBADF; /* don't allow io_uring files */ if (io_uring_get_socket(file)) { fput(file); return -EINVAL; } *fpp++ = file; fpl->count++; } if (!fpl->user) fpl->user = get_uid(current_user()); return num; } void __scm_destroy(struct scm_cookie *scm) { struct scm_fp_list *fpl = scm->fp; int i; if (fpl) { scm->fp = NULL; for (i=fpl->count-1; i>=0; i--) fput(fpl->fp[i]); free_uid(fpl->user); kfree(fpl); } } EXPORT_SYMBOL(__scm_destroy); int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) { const struct proto_ops *ops = READ_ONCE(sock->ops); struct cmsghdr *cmsg; int err; for_each_cmsghdr(cmsg, msg) { err = -EINVAL; /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */ /* The first check was omitted in <= 2.2.5. The reasoning was that parser checks cmsg_len in any case, so that additional check would be work duplication. But if cmsg_level is not SOL_SOCKET, we do not check for too short ancillary data object at all! Oops. OK, let's add it... */ if (!CMSG_OK(msg, cmsg)) goto error; if (cmsg->cmsg_level != SOL_SOCKET) continue; switch (cmsg->cmsg_type) { case SCM_RIGHTS: if (!ops || ops->family != PF_UNIX) goto error; err=scm_fp_copy(cmsg, &p->fp); if (err<0) goto error; break; case SCM_CREDENTIALS: { struct ucred creds; kuid_t uid; kgid_t gid; if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred))) goto error; memcpy(&creds, CMSG_DATA(cmsg), sizeof(struct ucred)); err = scm_check_creds(&creds); if (err) goto error; p->creds.pid = creds.pid; if (!p->pid || pid_vnr(p->pid) != creds.pid) { struct pid *pid; err = -ESRCH; pid = find_get_pid(creds.pid); if (!pid) goto error; put_pid(p->pid); p->pid = pid; } err = -EINVAL; uid = make_kuid(current_user_ns(), creds.uid); gid = make_kgid(current_user_ns(), creds.gid); if (!uid_valid(uid) || !gid_valid(gid)) goto error; p->creds.uid = uid; p->creds.gid = gid; break; } default: goto error; } } if (p->fp && !p->fp->count) { kfree(p->fp); p->fp = NULL; } return 0; error: scm_destroy(p); return err; } EXPORT_SYMBOL(__scm_send); int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) { int cmlen = CMSG_LEN(len); if (msg->msg_flags & MSG_CMSG_COMPAT) return put_cmsg_compat(msg, level, type, len, data); if (!msg->msg_control || msg->msg_controllen < sizeof(struct cmsghdr)) { msg->msg_flags |= MSG_CTRUNC; return 0; /* XXX: return error? check spec. */ } if (msg->msg_controllen < cmlen) { msg->msg_flags |= MSG_CTRUNC; cmlen = msg->msg_controllen; } if (msg->msg_control_is_user) { struct cmsghdr __user *cm = msg->msg_control_user; check_object_size(data, cmlen - sizeof(*cm), true); if (!user_write_access_begin(cm, cmlen)) goto efault; unsafe_put_user(cmlen, &cm->cmsg_len, efault_end); unsafe_put_user(level, &cm->cmsg_level, efault_end); unsafe_put_user(type, &cm->cmsg_type, efault_end); unsafe_copy_to_user(CMSG_USER_DATA(cm), data, cmlen - sizeof(*cm), efault_end); user_write_access_end(); } else { struct cmsghdr *cm = msg->msg_control; cm->cmsg_level = level; cm->cmsg_type = type; cm->cmsg_len = cmlen; memcpy(CMSG_DATA(cm), data, cmlen - sizeof(*cm)); } cmlen = min(CMSG_SPACE(len), msg->msg_controllen); if (msg->msg_control_is_user) msg->msg_control_user += cmlen; else msg->msg_control += cmlen; msg->msg_controllen -= cmlen; return 0; efault_end: user_write_access_end(); efault: return -EFAULT; } EXPORT_SYMBOL(put_cmsg); void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss_internal) { struct scm_timestamping64 tss; int i; for (i = 0; i < ARRAY_SIZE(tss.ts); i++) { tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec; tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec; } put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_NEW, sizeof(tss), &tss); } EXPORT_SYMBOL(put_cmsg_scm_timestamping64); void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_internal *tss_internal) { struct scm_timestamping tss; int i; for (i = 0; i < ARRAY_SIZE(tss.ts); i++) { tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec; tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec; } put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_OLD, sizeof(tss), &tss); } EXPORT_SYMBOL(put_cmsg_scm_timestamping); static int scm_max_fds(struct msghdr *msg) { if (msg->msg_controllen <= sizeof(struct cmsghdr)) return 0; return (msg->msg_controllen - sizeof(struct cmsghdr)) / sizeof(int); } void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) { struct cmsghdr __user *cm = (__force struct cmsghdr __user *)msg->msg_control_user; unsigned int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0; int fdmax = min_t(int, scm_max_fds(msg), scm->fp->count); int __user *cmsg_data = CMSG_USER_DATA(cm); int err = 0, i; /* no use for FD passing from kernel space callers */ if (WARN_ON_ONCE(!msg->msg_control_is_user)) return; if (msg->msg_flags & MSG_CMSG_COMPAT) { scm_detach_fds_compat(msg, scm); return; } for (i = 0; i < fdmax; i++) { err = receive_fd_user(scm->fp->fp[i], cmsg_data + i, o_flags); if (err < 0) break; } if (i > 0) { int cmlen = CMSG_LEN(i * sizeof(int)); err = put_user(SOL_SOCKET, &cm->cmsg_level); if (!err) err = put_user(SCM_RIGHTS, &cm->cmsg_type); if (!err) err = put_user(cmlen, &cm->cmsg_len); if (!err) { cmlen = CMSG_SPACE(i * sizeof(int)); if (msg->msg_controllen < cmlen) cmlen = msg->msg_controllen; msg->msg_control_user += cmlen; msg->msg_controllen -= cmlen; } } if (i < scm->fp->count || (scm->fp->count && fdmax <= 0)) msg->msg_flags |= MSG_CTRUNC; /* * All of the files that fit in the message have had their usage counts * incremented, so we just free the list. */ __scm_destroy(scm); } EXPORT_SYMBOL(scm_detach_fds); struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) { struct scm_fp_list *new_fpl; int i; if (!fpl) return NULL; new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]), GFP_KERNEL_ACCOUNT); if (new_fpl) { for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); new_fpl->max = new_fpl->count; new_fpl->user = get_uid(fpl->user); } return new_fpl; } EXPORT_SYMBOL(scm_fp_dup);
5 1 4 4 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 1 2 4 1 4 1 3 3 1 4 1 1 7 7 1 1 5 3 4 4 4 1 1 4 4 4 1 2 1 1 36 2 28 1 31 2 1 3 1 1 2 2 23 2 3 27 23 4 26 26 26 27 27 27 27 1 26 3 25 21 21 23 21 2 1 1 16 1 3 3 16 17 2 17 3 1 15 14 15 15 27 28 27 7 20 1 58 1 42 46 45 9 2 46 4 5 2 3 39 42 1 22 18 3 5 4 28 5 1 29 2 4 21 1 1 1 56 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 // SPDX-License-Identifier: GPL-2.0 #include <linux/fanotify.h> #include <linux/fcntl.h> #include <linux/fdtable.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/anon_inodes.h> #include <linux/fsnotify_backend.h> #include <linux/init.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/poll.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/uaccess.h> #include <linux/compat.h> #include <linux/sched/signal.h> #include <linux/memcontrol.h> #include <linux/statfs.h> #include <linux/exportfs.h> #include <asm/ioctls.h> #include "../../mount.h" #include "../fdinfo.h" #include "fanotify.h" #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 #define FANOTIFY_OLD_DEFAULT_MAX_MARKS 8192 #define FANOTIFY_DEFAULT_MAX_GROUPS 128 #define FANOTIFY_DEFAULT_FEE_POOL_SIZE 32 /* * Legacy fanotify marks limits (8192) is per group and we introduced a tunable * limit of marks per user, similar to inotify. Effectively, the legacy limit * of fanotify marks per user is <max marks per group> * <max groups per user>. * This default limit (1M) also happens to match the increased limit of inotify * max_user_watches since v5.10. */ #define FANOTIFY_DEFAULT_MAX_USER_MARKS \ (FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS) /* * Most of the memory cost of adding an inode mark is pinning the marked inode. * The size of the filesystem inode struct is not uniform across filesystems, * so double the size of a VFS inode is used as a conservative approximation. */ #define INODE_MARK_COST (2 * sizeof(struct inode)) /* configurable via /proc/sys/fs/fanotify/ */ static int fanotify_max_queued_events __read_mostly; #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> static long ft_zero = 0; static long ft_int_max = INT_MAX; static struct ctl_table fanotify_table[] = { { .procname = "max_user_groups", .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS], .maxlen = sizeof(long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = &ft_zero, .extra2 = &ft_int_max, }, { .procname = "max_user_marks", .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS], .maxlen = sizeof(long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = &ft_zero, .extra2 = &ft_int_max, }, { .procname = "max_queued_events", .data = &fanotify_max_queued_events, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO }, { } }; static void __init fanotify_sysctls_init(void) { register_sysctl("fs/fanotify", fanotify_table); } #else #define fanotify_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ /* * All flags that may be specified in parameter event_f_flags of fanotify_init. * * Internal and external open flags are stored together in field f_flags of * struct file. Only external open flags shall be allowed in event_f_flags. * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be * excluded. */ #define FANOTIFY_INIT_ALL_EVENT_F_BITS ( \ O_ACCMODE | O_APPEND | O_NONBLOCK | \ __O_SYNC | O_DSYNC | O_CLOEXEC | \ O_LARGEFILE | O_NOATIME ) extern const struct fsnotify_ops fanotify_fsnotify_ops; struct kmem_cache *fanotify_mark_cache __ro_after_init; struct kmem_cache *fanotify_fid_event_cachep __ro_after_init; struct kmem_cache *fanotify_path_event_cachep __ro_after_init; struct kmem_cache *fanotify_perm_event_cachep __ro_after_init; #define FANOTIFY_EVENT_ALIGN 4 #define FANOTIFY_FID_INFO_HDR_LEN \ (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) #define FANOTIFY_PIDFD_INFO_HDR_LEN \ sizeof(struct fanotify_event_info_pidfd) #define FANOTIFY_ERROR_INFO_LEN \ (sizeof(struct fanotify_event_info_error)) static int fanotify_fid_info_len(int fh_len, int name_len) { int info_len = fh_len; if (name_len) info_len += name_len + 1; return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len, FANOTIFY_EVENT_ALIGN); } /* FAN_RENAME may have one or two dir+name info records */ static int fanotify_dir_name_info_len(struct fanotify_event *event) { struct fanotify_info *info = fanotify_event_info(event); int dir_fh_len = fanotify_event_dir_fh_len(event); int dir2_fh_len = fanotify_event_dir2_fh_len(event); int info_len = 0; if (dir_fh_len) info_len += fanotify_fid_info_len(dir_fh_len, info->name_len); if (dir2_fh_len) info_len += fanotify_fid_info_len(dir2_fh_len, info->name2_len); return info_len; } static size_t fanotify_event_len(unsigned int info_mode, struct fanotify_event *event) { size_t event_len = FAN_EVENT_METADATA_LEN; int fh_len; int dot_len = 0; if (!info_mode) return event_len; if (fanotify_is_error_event(event->mask)) event_len += FANOTIFY_ERROR_INFO_LEN; if (fanotify_event_has_any_dir_fh(event)) { event_len += fanotify_dir_name_info_len(event); } else if ((info_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) { /* * With group flag FAN_REPORT_NAME, if name was not recorded in * event on a directory, we will report the name ".". */ dot_len = 1; } if (info_mode & FAN_REPORT_PIDFD) event_len += FANOTIFY_PIDFD_INFO_HDR_LEN; if (fanotify_event_has_object_fh(event)) { fh_len = fanotify_event_object_fh_len(event); event_len += fanotify_fid_info_len(fh_len, dot_len); } return event_len; } /* * Remove an hashed event from merge hash table. */ static void fanotify_unhash_event(struct fsnotify_group *group, struct fanotify_event *event) { assert_spin_locked(&group->notification_lock); pr_debug("%s: group=%p event=%p bucket=%u\n", __func__, group, event, fanotify_event_hash_bucket(group, event)); if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list))) return; hlist_del_init(&event->merge_list); } /* * Get an fanotify notification event if one exists and is small * enough to fit in "count". Return an error pointer if the count * is not large enough. When permission event is dequeued, its state is * updated accordingly. */ static struct fanotify_event *get_one_event(struct fsnotify_group *group, size_t count) { size_t event_size; struct fanotify_event *event = NULL; struct fsnotify_event *fsn_event; unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); pr_debug("%s: group=%p count=%zd\n", __func__, group, count); spin_lock(&group->notification_lock); fsn_event = fsnotify_peek_first_event(group); if (!fsn_event) goto out; event = FANOTIFY_E(fsn_event); event_size = fanotify_event_len(info_mode, event); if (event_size > count) { event = ERR_PTR(-EINVAL); goto out; } /* * Held the notification_lock the whole time, so this is the * same event we peeked above. */ fsnotify_remove_first_event(group); if (fanotify_is_perm_event(event->mask)) FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED; if (fanotify_is_hashed_event(event->mask)) fanotify_unhash_event(group, event); out: spin_unlock(&group->notification_lock); return event; } static int create_fd(struct fsnotify_group *group, const struct path *path, struct file **file) { int client_fd; struct file *new_file; client_fd = get_unused_fd_flags(group->fanotify_data.f_flags); if (client_fd < 0) return client_fd; /* * we need a new file handle for the userspace program so it can read even if it was * originally opened O_WRONLY. */ new_file = dentry_open(path, group->fanotify_data.f_flags | __FMODE_NONOTIFY, current_cred()); if (IS_ERR(new_file)) { /* * we still send an event even if we can't open the file. this * can happen when say tasks are gone and we try to open their * /proc files or we try to open a WRONLY file like in sysfs * we just send the errno to userspace since there isn't much * else we can do. */ put_unused_fd(client_fd); client_fd = PTR_ERR(new_file); } else { *file = new_file; } return client_fd; } static int process_access_response_info(const char __user *info, size_t info_len, struct fanotify_response_info_audit_rule *friar) { if (info_len != sizeof(*friar)) return -EINVAL; if (copy_from_user(friar, info, sizeof(*friar))) return -EFAULT; if (friar->hdr.type != FAN_RESPONSE_INFO_AUDIT_RULE) return -EINVAL; if (friar->hdr.pad != 0) return -EINVAL; if (friar->hdr.len != sizeof(*friar)) return -EINVAL; return info_len; } /* * Finish processing of permission event by setting it to ANSWERED state and * drop group->notification_lock. */ static void finish_permission_event(struct fsnotify_group *group, struct fanotify_perm_event *event, u32 response, struct fanotify_response_info_audit_rule *friar) __releases(&group->notification_lock) { bool destroy = false; assert_spin_locked(&group->notification_lock); event->response = response & ~FAN_INFO; if (response & FAN_INFO) memcpy(&event->audit_rule, friar, sizeof(*friar)); if (event->state == FAN_EVENT_CANCELED) destroy = true; else event->state = FAN_EVENT_ANSWERED; spin_unlock(&group->notification_lock); if (destroy) fsnotify_destroy_event(group, &event->fae.fse); } static int process_access_response(struct fsnotify_group *group, struct fanotify_response *response_struct, const char __user *info, size_t info_len) { struct fanotify_perm_event *event; int fd = response_struct->fd; u32 response = response_struct->response; int ret = info_len; struct fanotify_response_info_audit_rule friar; pr_debug("%s: group=%p fd=%d response=%u buf=%p size=%zu\n", __func__, group, fd, response, info, info_len); /* * make sure the response is valid, if invalid we do nothing and either * userspace can send a valid response or we will clean it up after the * timeout */ if (response & ~FANOTIFY_RESPONSE_VALID_MASK) return -EINVAL; switch (response & FANOTIFY_RESPONSE_ACCESS) { case FAN_ALLOW: case FAN_DENY: break; default: return -EINVAL; } if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT)) return -EINVAL; if (response & FAN_INFO) { ret = process_access_response_info(info, info_len, &friar); if (ret < 0) return ret; if (fd == FAN_NOFD) return ret; } else { ret = 0; } if (fd < 0) return -EINVAL; spin_lock(&group->notification_lock); list_for_each_entry(event, &group->fanotify_data.access_list, fae.fse.list) { if (event->fd != fd) continue; list_del_init(&event->fae.fse.list); finish_permission_event(group, event, response, &friar); wake_up(&group->fanotify_data.access_waitq); return ret; } spin_unlock(&group->notification_lock); return -ENOENT; } static size_t copy_error_info_to_user(struct fanotify_event *event, char __user *buf, int count) { struct fanotify_event_info_error info = { }; struct fanotify_error_event *fee = FANOTIFY_EE(event); info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR; info.hdr.len = FANOTIFY_ERROR_INFO_LEN; if (WARN_ON(count < info.hdr.len)) return -EFAULT; info.error = fee->error; info.error_count = fee->err_count; if (copy_to_user(buf, &info, sizeof(info))) return -EFAULT; return info.hdr.len; } static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, int info_type, const char *name, size_t name_len, char __user *buf, size_t count) { struct fanotify_event_info_fid info = { }; struct file_handle handle = { }; unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf; size_t fh_len = fh ? fh->len : 0; size_t info_len = fanotify_fid_info_len(fh_len, name_len); size_t len = info_len; pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n", __func__, fh_len, name_len, info_len, count); if (WARN_ON_ONCE(len < sizeof(info) || len > count)) return -EFAULT; /* * Copy event info fid header followed by variable sized file handle * and optionally followed by variable sized filename. */ switch (info_type) { case FAN_EVENT_INFO_TYPE_FID: case FAN_EVENT_INFO_TYPE_DFID: if (WARN_ON_ONCE(name_len)) return -EFAULT; break; case FAN_EVENT_INFO_TYPE_DFID_NAME: case FAN_EVENT_INFO_TYPE_OLD_DFID_NAME: case FAN_EVENT_INFO_TYPE_NEW_DFID_NAME: if (WARN_ON_ONCE(!name || !name_len)) return -EFAULT; break; default: return -EFAULT; } info.hdr.info_type = info_type; info.hdr.len = len; info.fsid = *fsid; if (copy_to_user(buf, &info, sizeof(info))) return -EFAULT; buf += sizeof(info); len -= sizeof(info); if (WARN_ON_ONCE(len < sizeof(handle))) return -EFAULT; handle.handle_type = fh->type; handle.handle_bytes = fh_len; /* Mangle handle_type for bad file_handle */ if (!fh_len) handle.handle_type = FILEID_INVALID; if (copy_to_user(buf, &handle, sizeof(handle))) return -EFAULT; buf += sizeof(handle); len -= sizeof(handle); if (WARN_ON_ONCE(len < fh_len)) return -EFAULT; /* * For an inline fh and inline file name, copy through stack to exclude * the copy from usercopy hardening protections. */ fh_buf = fanotify_fh_buf(fh); if (fh_len <= FANOTIFY_INLINE_FH_LEN) { memcpy(bounce, fh_buf, fh_len); fh_buf = bounce; } if (copy_to_user(buf, fh_buf, fh_len)) return -EFAULT; buf += fh_len; len -= fh_len; if (name_len) { /* Copy the filename with terminating null */ name_len++; if (WARN_ON_ONCE(len < name_len)) return -EFAULT; if (copy_to_user(buf, name, name_len)) return -EFAULT; buf += name_len; len -= name_len; } /* Pad with 0's */ WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN); if (len > 0 && clear_user(buf, len)) return -EFAULT; return info_len; } static int copy_pidfd_info_to_user(int pidfd, char __user *buf, size_t count) { struct fanotify_event_info_pidfd info = { }; size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; if (WARN_ON_ONCE(info_len > count)) return -EFAULT; info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; info.hdr.len = info_len; info.pidfd = pidfd; if (copy_to_user(buf, &info, info_len)) return -EFAULT; return info_len; } static int copy_info_records_to_user(struct fanotify_event *event, struct fanotify_info *info, unsigned int info_mode, int pidfd, char __user *buf, size_t count) { int ret, total_bytes = 0, info_type = 0; unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS; unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; /* * Event info records order is as follows: * 1. dir fid + name * 2. (optional) new dir fid + new name * 3. (optional) child fid */ if (fanotify_event_has_dir_fh(event)) { info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : FAN_EVENT_INFO_TYPE_DFID; /* FAN_RENAME uses special info types */ if (event->mask & FAN_RENAME) info_type = FAN_EVENT_INFO_TYPE_OLD_DFID_NAME; ret = copy_fid_info_to_user(fanotify_event_fsid(event), fanotify_info_dir_fh(info), info_type, fanotify_info_name(info), info->name_len, buf, count); if (ret < 0) return ret; buf += ret; count -= ret; total_bytes += ret; } /* New dir fid+name may be reported in addition to old dir fid+name */ if (fanotify_event_has_dir2_fh(event)) { info_type = FAN_EVENT_INFO_TYPE_NEW_DFID_NAME; ret = copy_fid_info_to_user(fanotify_event_fsid(event), fanotify_info_dir2_fh(info), info_type, fanotify_info_name2(info), info->name2_len, buf, count); if (ret < 0) return ret; buf += ret; count -= ret; total_bytes += ret; } if (fanotify_event_has_object_fh(event)) { const char *dot = NULL; int dot_len = 0; if (fid_mode == FAN_REPORT_FID || info_type) { /* * With only group flag FAN_REPORT_FID only type FID is * reported. Second info record type is always FID. */ info_type = FAN_EVENT_INFO_TYPE_FID; } else if ((fid_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) { /* * With group flag FAN_REPORT_NAME, if name was not * recorded in an event on a directory, report the name * "." with info type DFID_NAME. */ info_type = FAN_EVENT_INFO_TYPE_DFID_NAME; dot = "."; dot_len = 1; } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) || (event->mask & FAN_ONDIR)) { /* * With group flag FAN_REPORT_DIR_FID, a single info * record has type DFID for directory entry modification * event and for event on a directory. */ info_type = FAN_EVENT_INFO_TYPE_DFID; } else { /* * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID, * a single info record has type FID for event on a * non-directory, when there is no directory to report. * For example, on FAN_DELETE_SELF event. */ info_type = FAN_EVENT_INFO_TYPE_FID; } ret = copy_fid_info_to_user(fanotify_event_fsid(event), fanotify_event_object_fh(event), info_type, dot, dot_len, buf, count); if (ret < 0) return ret; buf += ret; count -= ret; total_bytes += ret; } if (pidfd_mode) { ret = copy_pidfd_info_to_user(pidfd, buf, count); if (ret < 0) return ret; buf += ret; count -= ret; total_bytes += ret; } if (fanotify_is_error_event(event->mask)) { ret = copy_error_info_to_user(event, buf, count); if (ret < 0) return ret; buf += ret; count -= ret; total_bytes += ret; } return total_bytes; } static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fanotify_event *event, char __user *buf, size_t count) { struct fanotify_event_metadata metadata; const struct path *path = fanotify_event_path(event); struct fanotify_info *info = fanotify_event_info(event); unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; struct file *f = NULL, *pidfd_file = NULL; int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD; pr_debug("%s: group=%p event=%p\n", __func__, group, event); metadata.event_len = fanotify_event_len(info_mode, event); metadata.metadata_len = FAN_EVENT_METADATA_LEN; metadata.vers = FANOTIFY_METADATA_VERSION; metadata.reserved = 0; metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS; metadata.pid = pid_vnr(event->pid); /* * For an unprivileged listener, event->pid can be used to identify the * events generated by the listener process itself, without disclosing * the pids of other processes. */ if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) && task_tgid(current) != event->pid) metadata.pid = 0; /* * For now, fid mode is required for an unprivileged listener and * fid mode does not report fd in events. Keep this check anyway * for safety in case fid mode requirement is relaxed in the future * to allow unprivileged listener to get events with no fd and no fid. */ if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) && path && path->mnt && path->dentry) { fd = create_fd(group, path, &f); if (fd < 0) return fd; } metadata.fd = fd; if (pidfd_mode) { /* * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual * exclusion is ever lifted. At the time of incoporating pidfd * support within fanotify, the pidfd API only supported the * creation of pidfds for thread-group leaders. */ WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID)); /* * The PIDTYPE_TGID check for an event->pid is performed * preemptively in an attempt to catch out cases where the event * listener reads events after the event generating process has * already terminated. Report FAN_NOPIDFD to the event listener * in those cases, with all other pidfd creation errors being * reported as FAN_EPIDFD. */ if (metadata.pid == 0 || !pid_has_task(event->pid, PIDTYPE_TGID)) { pidfd = FAN_NOPIDFD; } else { pidfd = pidfd_prepare(event->pid, 0, &pidfd_file); if (pidfd < 0) pidfd = FAN_EPIDFD; } } ret = -EFAULT; /* * Sanity check copy size in case get_one_event() and * event_len sizes ever get out of sync. */ if (WARN_ON_ONCE(metadata.event_len > count)) goto out_close_fd; if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN)) goto out_close_fd; buf += FAN_EVENT_METADATA_LEN; count -= FAN_EVENT_METADATA_LEN; if (fanotify_is_perm_event(event->mask)) FANOTIFY_PERM(event)->fd = fd; if (info_mode) { ret = copy_info_records_to_user(event, info, info_mode, pidfd, buf, count); if (ret < 0) goto out_close_fd; } if (f) fd_install(fd, f); if (pidfd_file) fd_install(pidfd, pidfd_file); return metadata.event_len; out_close_fd: if (fd != FAN_NOFD) { put_unused_fd(fd); fput(f); } if (pidfd >= 0) { put_unused_fd(pidfd); fput(pidfd_file); } return ret; } /* intofiy userspace file descriptor functions */ static __poll_t fanotify_poll(struct file *file, poll_table *wait) { struct fsnotify_group *group = file->private_data; __poll_t ret = 0; poll_wait(file, &group->notification_waitq, wait); spin_lock(&group->notification_lock); if (!fsnotify_notify_queue_is_empty(group)) ret = EPOLLIN | EPOLLRDNORM; spin_unlock(&group->notification_lock); return ret; } static ssize_t fanotify_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { struct fsnotify_group *group; struct fanotify_event *event; char __user *start; int ret; DEFINE_WAIT_FUNC(wait, woken_wake_function); start = buf; group = file->private_data; pr_debug("%s: group=%p\n", __func__, group); add_wait_queue(&group->notification_waitq, &wait); while (1) { /* * User can supply arbitrarily large buffer. Avoid softlockups * in case there are lots of available events. */ cond_resched(); event = get_one_event(group, count); if (IS_ERR(event)) { ret = PTR_ERR(event); break; } if (!event) { ret = -EAGAIN; if (file->f_flags & O_NONBLOCK) break; ret = -ERESTARTSYS; if (signal_pending(current)) break; if (start != buf) break; wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); continue; } ret = copy_event_to_user(group, event, buf, count); if (unlikely(ret == -EOPENSTALE)) { /* * We cannot report events with stale fd so drop it. * Setting ret to 0 will continue the event loop and * do the right thing if there are no more events to * read (i.e. return bytes read, -EAGAIN or wait). */ ret = 0; } /* * Permission events get queued to wait for response. Other * events can be destroyed now. */ if (!fanotify_is_perm_event(event->mask)) { fsnotify_destroy_event(group, &event->fse); } else { if (ret <= 0) { spin_lock(&group->notification_lock); finish_permission_event(group, FANOTIFY_PERM(event), FAN_DENY, NULL); wake_up(&group->fanotify_data.access_waitq); } else { spin_lock(&group->notification_lock); list_add_tail(&event->fse.list, &group->fanotify_data.access_list); spin_unlock(&group->notification_lock); } } if (ret < 0) break; buf += ret; count -= ret; } remove_wait_queue(&group->notification_waitq, &wait); if (start != buf && ret != -EFAULT) ret = buf - start; return ret; } static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { struct fanotify_response response; struct fsnotify_group *group; int ret; const char __user *info_buf = buf + sizeof(struct fanotify_response); size_t info_len; if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) return -EINVAL; group = file->private_data; pr_debug("%s: group=%p count=%zu\n", __func__, group, count); if (count < sizeof(response)) return -EINVAL; if (copy_from_user(&response, buf, sizeof(response))) return -EFAULT; info_len = count - sizeof(response); ret = process_access_response(group, &response, info_buf, info_len); if (ret < 0) count = ret; else count = sizeof(response) + ret; return count; } static int fanotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; struct fsnotify_event *fsn_event; /* * Stop new events from arriving in the notification queue. since * userspace cannot use fanotify fd anymore, no event can enter or * leave access_list by now either. */ fsnotify_group_stop_queueing(group); /* * Process all permission events on access_list and notification queue * and simulate reply from userspace. */ spin_lock(&group->notification_lock); while (!list_empty(&group->fanotify_data.access_list)) { struct fanotify_perm_event *event; event = list_first_entry(&group->fanotify_data.access_list, struct fanotify_perm_event, fae.fse.list); list_del_init(&event->fae.fse.list); finish_permission_event(group, event, FAN_ALLOW, NULL); spin_lock(&group->notification_lock); } /* * Destroy all non-permission events. For permission events just * dequeue them and set the response. They will be freed once the * response is consumed and fanotify_get_response() returns. */ while ((fsn_event = fsnotify_remove_first_event(group))) { struct fanotify_event *event = FANOTIFY_E(fsn_event); if (!(event->mask & FANOTIFY_PERM_EVENTS)) { spin_unlock(&group->notification_lock); fsnotify_destroy_event(group, fsn_event); } else { finish_permission_event(group, FANOTIFY_PERM(event), FAN_ALLOW, NULL); } spin_lock(&group->notification_lock); } spin_unlock(&group->notification_lock); /* Response for all permission events it set, wakeup waiters */ wake_up(&group->fanotify_data.access_waitq); /* matches the fanotify_init->fsnotify_alloc_group */ fsnotify_destroy_group(group); return 0; } static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fsnotify_group *group; struct fsnotify_event *fsn_event; void __user *p; int ret = -ENOTTY; size_t send_len = 0; group = file->private_data; p = (void __user *) arg; switch (cmd) { case FIONREAD: spin_lock(&group->notification_lock); list_for_each_entry(fsn_event, &group->notification_list, list) send_len += FAN_EVENT_METADATA_LEN; spin_unlock(&group->notification_lock); ret = put_user(send_len, (int __user *) p); break; } return ret; } static const struct file_operations fanotify_fops = { .show_fdinfo = fanotify_show_fdinfo, .poll = fanotify_poll, .read = fanotify_read, .write = fanotify_write, .fasync = NULL, .release = fanotify_release, .unlocked_ioctl = fanotify_ioctl, .compat_ioctl = compat_ptr_ioctl, .llseek = noop_llseek, }; static int fanotify_find_path(int dfd, const char __user *filename, struct path *path, unsigned int flags, __u64 mask, unsigned int obj_type) { int ret; pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__, dfd, filename, flags); if (filename == NULL) { struct fd f = fdget(dfd); ret = -EBADF; if (!f.file) goto out; ret = -ENOTDIR; if ((flags & FAN_MARK_ONLYDIR) && !(S_ISDIR(file_inode(f.file)->i_mode))) { fdput(f); goto out; } *path = f.file->f_path; path_get(path); fdput(f); } else { unsigned int lookup_flags = 0; if (!(flags & FAN_MARK_DONT_FOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (flags & FAN_MARK_ONLYDIR) lookup_flags |= LOOKUP_DIRECTORY; ret = user_path_at(dfd, filename, lookup_flags, path); if (ret) goto out; } /* you can only watch an inode if you have read permissions on it */ ret = path_permission(path, MAY_READ); if (ret) { path_put(path); goto out; } ret = security_path_notify(path, mask, obj_type); if (ret) path_put(path); out: return ret; } static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, __u32 mask, unsigned int flags, __u32 umask, int *destroy) { __u32 oldmask, newmask; /* umask bits cannot be removed by user */ mask &= ~umask; spin_lock(&fsn_mark->lock); oldmask = fsnotify_calc_mask(fsn_mark); if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) { fsn_mark->mask &= ~mask; } else { fsn_mark->ignore_mask &= ~mask; } newmask = fsnotify_calc_mask(fsn_mark); /* * We need to keep the mark around even if remaining mask cannot * result in any events (e.g. mask == FAN_ONDIR) to support incremenal * changes to the mask. * Destroy mark when only umask bits remain. */ *destroy = !((fsn_mark->mask | fsn_mark->ignore_mask) & ~umask); spin_unlock(&fsn_mark->lock); return oldmask & ~newmask; } static int fanotify_remove_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, __u32 mask, unsigned int flags, __u32 umask) { struct fsnotify_mark *fsn_mark = NULL; __u32 removed; int destroy_mark; fsnotify_group_lock(group); fsn_mark = fsnotify_find_mark(connp, group); if (!fsn_mark) { fsnotify_group_unlock(group); return -ENOENT; } removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, umask, &destroy_mark); if (removed & fsnotify_conn_mask(fsn_mark->connector)) fsnotify_recalc_mask(fsn_mark->connector); if (destroy_mark) fsnotify_detach_mark(fsn_mark); fsnotify_group_unlock(group); if (destroy_mark) fsnotify_free_mark(fsn_mark); /* matches the fsnotify_find_mark() */ fsnotify_put_mark(fsn_mark); return 0; } static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt, __u32 mask, unsigned int flags, __u32 umask) { return fanotify_remove_mark(group, &real_mount(mnt)->mnt_fsnotify_marks, mask, flags, umask); } static int fanotify_remove_sb_mark(struct fsnotify_group *group, struct super_block *sb, __u32 mask, unsigned int flags, __u32 umask) { return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask, flags, umask); } static int fanotify_remove_inode_mark(struct fsnotify_group *group, struct inode *inode, __u32 mask, unsigned int flags, __u32 umask) { return fanotify_remove_mark(group, &inode->i_fsnotify_marks, mask, flags, umask); } static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark, unsigned int fan_flags) { bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE); unsigned int ignore = fan_flags & FANOTIFY_MARK_IGNORE_BITS; bool recalc = false; /* * When using FAN_MARK_IGNORE for the first time, mark starts using * independent event flags in ignore mask. After that, trying to * update the ignore mask with the old FAN_MARK_IGNORED_MASK API * will result in EEXIST error. */ if (ignore == FAN_MARK_IGNORE) fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS; /* * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to * the removal of the FS_MODIFY bit in calculated mask if it was set * because of an ignore mask that is now going to survive FS_MODIFY. */ if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) { fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; if (!(fsn_mark->mask & FS_MODIFY)) recalc = true; } if (fsn_mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE || want_iref == !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) return recalc; /* * NO_IREF may be removed from a mark, but not added. * When removed, fsnotify_recalc_mask() will take the inode ref. */ WARN_ON_ONCE(!want_iref); fsn_mark->flags &= ~FSNOTIFY_MARK_FLAG_NO_IREF; return true; } static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, __u32 mask, unsigned int fan_flags) { bool recalc; spin_lock(&fsn_mark->lock); if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS)) fsn_mark->mask |= mask; else fsn_mark->ignore_mask |= mask; recalc = fsnotify_calc_mask(fsn_mark) & ~fsnotify_conn_mask(fsn_mark->connector); recalc |= fanotify_mark_update_flags(fsn_mark, fan_flags); spin_unlock(&fsn_mark->lock); return recalc; } static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int obj_type, unsigned int fan_flags, __kernel_fsid_t *fsid) { struct ucounts *ucounts = group->fanotify_data.ucounts; struct fsnotify_mark *mark; int ret; /* * Enforce per user marks limits per user in all containing user ns. * A group with FAN_UNLIMITED_MARKS does not contribute to mark count * in the limited groups account. */ if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) && !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS)) return ERR_PTR(-ENOSPC); mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); if (!mark) { ret = -ENOMEM; goto out_dec_ucounts; } fsnotify_init_mark(mark, group); if (fan_flags & FAN_MARK_EVICTABLE) mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF; ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid); if (ret) { fsnotify_put_mark(mark); goto out_dec_ucounts; } return mark; out_dec_ucounts: if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS)) dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS); return ERR_PTR(ret); } static int fanotify_group_init_error_pool(struct fsnotify_group *group) { if (mempool_initialized(&group->fanotify_data.error_events_pool)) return 0; return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool, FANOTIFY_DEFAULT_FEE_POOL_SIZE, sizeof(struct fanotify_error_event)); } static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark, unsigned int fan_flags) { /* * Non evictable mark cannot be downgraded to evictable mark. */ if (fan_flags & FAN_MARK_EVICTABLE && !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) return -EEXIST; /* * New ignore mask semantics cannot be downgraded to old semantics. */ if (fan_flags & FAN_MARK_IGNORED_MASK && fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) return -EEXIST; /* * An ignore mask that survives modify could never be downgraded to not * survive modify. With new FAN_MARK_IGNORE semantics we make that rule * explicit and return an error when trying to update the ignore mask * without the original FAN_MARK_IGNORED_SURV_MODIFY value. */ if (fan_flags & FAN_MARK_IGNORE && !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) && fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) return -EEXIST; return 0; } static int fanotify_add_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int obj_type, __u32 mask, unsigned int fan_flags, __kernel_fsid_t *fsid) { struct fsnotify_mark *fsn_mark; bool recalc; int ret = 0; fsnotify_group_lock(group); fsn_mark = fsnotify_find_mark(connp, group); if (!fsn_mark) { fsn_mark = fanotify_add_new_mark(group, connp, obj_type, fan_flags, fsid); if (IS_ERR(fsn_mark)) { fsnotify_group_unlock(group); return PTR_ERR(fsn_mark); } } /* * Check if requested mark flags conflict with an existing mark flags. */ ret = fanotify_may_update_existing_mark(fsn_mark, fan_flags); if (ret) goto out; /* * Error events are pre-allocated per group, only if strictly * needed (i.e. FAN_FS_ERROR was requested). */ if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) && (mask & FAN_FS_ERROR)) { ret = fanotify_group_init_error_pool(group); if (ret) goto out; } recalc = fanotify_mark_add_to_mask(fsn_mark, mask, fan_flags); if (recalc) fsnotify_recalc_mask(fsn_mark->connector); out: fsnotify_group_unlock(group); fsnotify_put_mark(fsn_mark); return ret; } static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt, __u32 mask, unsigned int flags, __kernel_fsid_t *fsid) { return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks, FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid); } static int fanotify_add_sb_mark(struct fsnotify_group *group, struct super_block *sb, __u32 mask, unsigned int flags, __kernel_fsid_t *fsid) { return fanotify_add_mark(group, &sb->s_fsnotify_marks, FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid); } static int fanotify_add_inode_mark(struct fsnotify_group *group, struct inode *inode, __u32 mask, unsigned int flags, __kernel_fsid_t *fsid) { pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); /* * If some other task has this inode open for write we should not add * an ignore mask, unless that ignore mask is supposed to survive * modification changes anyway. */ if ((flags & FANOTIFY_MARK_IGNORE_BITS) && !(flags & FAN_MARK_IGNORED_SURV_MODIFY) && inode_is_open_for_write(inode)) return 0; return fanotify_add_mark(group, &inode->i_fsnotify_marks, FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid); } static struct fsnotify_event *fanotify_alloc_overflow_event(void) { struct fanotify_event *oevent; oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT); if (!oevent) return NULL; fanotify_init_event(oevent, 0, FS_Q_OVERFLOW); oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW; return &oevent->fse; } static struct hlist_head *fanotify_alloc_merge_hash(void) { struct hlist_head *hash; hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS, GFP_KERNEL_ACCOUNT); if (!hash) return NULL; __hash_init(hash, FANOTIFY_HTABLE_SIZE); return hash; } /* fanotify syscalls */ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) { struct fsnotify_group *group; int f_flags, fd; unsigned int fid_mode = flags & FANOTIFY_FID_BITS; unsigned int class = flags & FANOTIFY_CLASS_BITS; unsigned int internal_flags = 0; pr_debug("%s: flags=%x event_f_flags=%x\n", __func__, flags, event_f_flags); if (!capable(CAP_SYS_ADMIN)) { /* * An unprivileged user can setup an fanotify group with * limited functionality - an unprivileged group is limited to * notification events with file handles and it cannot use * unlimited queue/marks. */ if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode) return -EPERM; /* * Setting the internal flag FANOTIFY_UNPRIV on the group * prevents setting mount/filesystem marks on this group and * prevents reporting pid and open fd in events. */ internal_flags |= FANOTIFY_UNPRIV; } #ifdef CONFIG_AUDITSYSCALL if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT)) #else if (flags & ~FANOTIFY_INIT_FLAGS) #endif return -EINVAL; /* * A pidfd can only be returned for a thread-group leader; thus * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually * exclusive. */ if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID)) return -EINVAL; if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) return -EINVAL; switch (event_f_flags & O_ACCMODE) { case O_RDONLY: case O_RDWR: case O_WRONLY: break; default: return -EINVAL; } if (fid_mode && class != FAN_CLASS_NOTIF) return -EINVAL; /* * Child name is reported with parent fid so requires dir fid. * We can report both child fid and dir fid with or without name. */ if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID)) return -EINVAL; /* * FAN_REPORT_TARGET_FID requires FAN_REPORT_NAME and FAN_REPORT_FID * and is used as an indication to report both dir and child fid on all * dirent events. */ if ((fid_mode & FAN_REPORT_TARGET_FID) && (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID))) return -EINVAL; f_flags = O_RDWR | __FMODE_NONOTIFY; if (flags & FAN_CLOEXEC) f_flags |= O_CLOEXEC; if (flags & FAN_NONBLOCK) f_flags |= O_NONBLOCK; /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ group = fsnotify_alloc_group(&fanotify_fsnotify_ops, FSNOTIFY_GROUP_USER | FSNOTIFY_GROUP_NOFS); if (IS_ERR(group)) { return PTR_ERR(group); } /* Enforce groups limits per user in all containing user ns */ group->fanotify_data.ucounts = inc_ucount(current_user_ns(), current_euid(), UCOUNT_FANOTIFY_GROUPS); if (!group->fanotify_data.ucounts) { fd = -EMFILE; goto out_destroy_group; } group->fanotify_data.flags = flags | internal_flags; group->memcg = get_mem_cgroup_from_mm(current->mm); group->fanotify_data.merge_hash = fanotify_alloc_merge_hash(); if (!group->fanotify_data.merge_hash) { fd = -ENOMEM; goto out_destroy_group; } group->overflow_event = fanotify_alloc_overflow_event(); if (unlikely(!group->overflow_event)) { fd = -ENOMEM; goto out_destroy_group; } if (force_o_largefile()) event_f_flags |= O_LARGEFILE; group->fanotify_data.f_flags = event_f_flags; init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); switch (class) { case FAN_CLASS_NOTIF: group->priority = FS_PRIO_0; break; case FAN_CLASS_CONTENT: group->priority = FS_PRIO_1; break; case FAN_CLASS_PRE_CONTENT: group->priority = FS_PRIO_2; break; default: fd = -EINVAL; goto out_destroy_group; } if (flags & FAN_UNLIMITED_QUEUE) { fd = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto out_destroy_group; group->max_events = UINT_MAX; } else { group->max_events = fanotify_max_queued_events; } if (flags & FAN_UNLIMITED_MARKS) { fd = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto out_destroy_group; } if (flags & FAN_ENABLE_AUDIT) { fd = -EPERM; if (!capable(CAP_AUDIT_WRITE)) goto out_destroy_group; } fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); if (fd < 0) goto out_destroy_group; return fd; out_destroy_group: fsnotify_destroy_group(group); return fd; } static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid) { __kernel_fsid_t root_fsid; int err; /* * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse). */ err = vfs_get_fsid(dentry, fsid); if (err) return err; if (!fsid->val[0] && !fsid->val[1]) return -ENODEV; /* * Make sure dentry is not of a filesystem subvolume (e.g. btrfs) * which uses a different fsid than sb root. */ err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid); if (err) return err; if (root_fsid.val[0] != fsid->val[0] || root_fsid.val[1] != fsid->val[1]) return -EXDEV; return 0; } /* Check if filesystem can encode a unique fid */ static int fanotify_test_fid(struct dentry *dentry, unsigned int flags) { unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; const struct export_operations *nop = dentry->d_sb->s_export_op; /* * We need to make sure that the filesystem supports encoding of * file handles so user can use name_to_handle_at() to compare fids * reported with events to the file handle of watched objects. */ if (!exportfs_can_encode_fid(nop)) return -EOPNOTSUPP; /* * For sb/mount mark, we also need to make sure that the filesystem * supports decoding file handles, so user has a way to map back the * reported fids to filesystem objects. */ if (mark_type != FAN_MARK_INODE && !exportfs_can_decode_fh(nop)) return -EOPNOTSUPP; return 0; } static int fanotify_events_supported(struct fsnotify_group *group, const struct path *path, __u64 mask, unsigned int flags) { unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; /* Strict validation of events in non-dir inode mask with v5.17+ APIs */ bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) || (mask & FAN_RENAME) || (flags & FAN_MARK_IGNORE); /* * Some filesystems such as 'proc' acquire unusual locks when opening * files. For them fanotify permission events have high chances of * deadlocking the system - open done when reporting fanotify event * blocks on this "unusual" lock while another process holding the lock * waits for fanotify permission event to be answered. Just disallow * permission events for such filesystems. */ if (mask & FANOTIFY_PERM_EVENTS && path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM) return -EINVAL; /* * mount and sb marks are not allowed on kernel internal pseudo fs, * like pipe_mnt, because that would subscribe to events on all the * anonynous pipes in the system. * * SB_NOUSER covers all of the internal pseudo fs whose objects are not * exposed to user's mount namespace, but there are other SB_KERNMOUNT * fs, like nsfs, debugfs, for which the value of allowing sb and mount * mark is questionable. For now we leave them alone. */ if (mark_type != FAN_MARK_INODE && path->mnt->mnt_sb->s_flags & SB_NOUSER) return -EINVAL; /* * We shouldn't have allowed setting dirent events and the directory * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode, * but because we always allowed it, error only when using new APIs. */ if (strict_dir_events && mark_type == FAN_MARK_INODE && !d_is_dir(path->dentry) && (mask & FANOTIFY_DIRONLY_EVENT_BITS)) return -ENOTDIR; return 0; } static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, int dfd, const char __user *pathname) { struct inode *inode = NULL; struct vfsmount *mnt = NULL; struct fsnotify_group *group; struct fd f; struct path path; __kernel_fsid_t __fsid, *fsid = NULL; u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS; unsigned int obj_type, fid_mode; u32 umask = 0; int ret; pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n", __func__, fanotify_fd, flags, dfd, pathname, mask); /* we only use the lower 32 bits as of right now. */ if (upper_32_bits(mask)) return -EINVAL; if (flags & ~FANOTIFY_MARK_FLAGS) return -EINVAL; switch (mark_type) { case FAN_MARK_INODE: obj_type = FSNOTIFY_OBJ_TYPE_INODE; break; case FAN_MARK_MOUNT: obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT; break; case FAN_MARK_FILESYSTEM: obj_type = FSNOTIFY_OBJ_TYPE_SB; break; default: return -EINVAL; } switch (mark_cmd) { case FAN_MARK_ADD: case FAN_MARK_REMOVE: if (!mask) return -EINVAL; break; case FAN_MARK_FLUSH: if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH)) return -EINVAL; break; default: return -EINVAL; } if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) valid_mask |= FANOTIFY_PERM_EVENTS; if (mask & ~valid_mask) return -EINVAL; /* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */ if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK)) return -EINVAL; /* * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with * FAN_MARK_IGNORED_MASK. */ if (ignore == FAN_MARK_IGNORED_MASK) { mask &= ~FANOTIFY_EVENT_FLAGS; umask = FANOTIFY_EVENT_FLAGS; } f = fdget(fanotify_fd); if (unlikely(!f.file)) return -EBADF; /* verify that this is indeed an fanotify instance */ ret = -EINVAL; if (unlikely(f.file->f_op != &fanotify_fops)) goto fput_and_out; group = f.file->private_data; /* * An unprivileged user is not allowed to setup mount nor filesystem * marks. This also includes setting up such marks by a group that * was initialized by an unprivileged user. */ ret = -EPERM; if ((!capable(CAP_SYS_ADMIN) || FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) && mark_type != FAN_MARK_INODE) goto fput_and_out; /* * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not * allowed to set permissions events. */ ret = -EINVAL; if (mask & FANOTIFY_PERM_EVENTS && group->priority == FS_PRIO_0) goto fput_and_out; if (mask & FAN_FS_ERROR && mark_type != FAN_MARK_FILESYSTEM) goto fput_and_out; /* * Evictable is only relevant for inode marks, because only inode object * can be evicted on memory pressure. */ if (flags & FAN_MARK_EVICTABLE && mark_type != FAN_MARK_INODE) goto fput_and_out; /* * Events that do not carry enough information to report * event->fd require a group that supports reporting fid. Those * events are not supported on a mount mark, because they do not * carry enough information (i.e. path) to be filtered by mount * point. */ fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) && (!fid_mode || mark_type == FAN_MARK_MOUNT)) goto fput_and_out; /* * FAN_RENAME uses special info type records to report the old and * new parent+name. Reporting only old and new parent id is less * useful and was not implemented. */ if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME)) goto fput_and_out; if (mark_cmd == FAN_MARK_FLUSH) { ret = 0; if (mark_type == FAN_MARK_MOUNT) fsnotify_clear_vfsmount_marks_by_group(group); else if (mark_type == FAN_MARK_FILESYSTEM) fsnotify_clear_sb_marks_by_group(group); else fsnotify_clear_inode_marks_by_group(group); goto fput_and_out; } ret = fanotify_find_path(dfd, pathname, &path, flags, (mask & ALL_FSNOTIFY_EVENTS), obj_type); if (ret) goto fput_and_out; if (mark_cmd == FAN_MARK_ADD) { ret = fanotify_events_supported(group, &path, mask, flags); if (ret) goto path_put_and_out; } if (fid_mode) { ret = fanotify_test_fsid(path.dentry, &__fsid); if (ret) goto path_put_and_out; ret = fanotify_test_fid(path.dentry, flags); if (ret) goto path_put_and_out; fsid = &__fsid; } /* inode held in place by reference to path; group by fget on fd */ if (mark_type == FAN_MARK_INODE) inode = path.dentry->d_inode; else mnt = path.mnt; ret = mnt ? -EINVAL : -EISDIR; /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ if (mark_cmd == FAN_MARK_ADD && ignore == FAN_MARK_IGNORE && (mnt || S_ISDIR(inode->i_mode)) && !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) goto path_put_and_out; /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ if (mnt || !S_ISDIR(inode->i_mode)) { mask &= ~FAN_EVENT_ON_CHILD; umask = FAN_EVENT_ON_CHILD; /* * If group needs to report parent fid, register for getting * events with parent/name info for non-directory. */ if ((fid_mode & FAN_REPORT_DIR_FID) && (flags & FAN_MARK_ADD) && !ignore) mask |= FAN_EVENT_ON_CHILD; } /* create/update an inode mark */ switch (mark_cmd) { case FAN_MARK_ADD: if (mark_type == FAN_MARK_MOUNT) ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags, fsid); else if (mark_type == FAN_MARK_FILESYSTEM) ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask, flags, fsid); else ret = fanotify_add_inode_mark(group, inode, mask, flags, fsid); break; case FAN_MARK_REMOVE: if (mark_type == FAN_MARK_MOUNT) ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags, umask); else if (mark_type == FAN_MARK_FILESYSTEM) ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask, flags, umask); else ret = fanotify_remove_inode_mark(group, inode, mask, flags, umask); break; default: ret = -EINVAL; } path_put_and_out: path_put(&path); fput_and_out: fdput(f); return ret; } #ifndef CONFIG_ARCH_SPLIT_ARG64 SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, __u64, mask, int, dfd, const char __user *, pathname) { return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname); } #endif #if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT) SYSCALL32_DEFINE6(fanotify_mark, int, fanotify_fd, unsigned int, flags, SC_ARG64(mask), int, dfd, const char __user *, pathname) { return do_fanotify_mark(fanotify_fd, flags, SC_VAL64(__u64, mask), dfd, pathname); } #endif /* * fanotify_user_setup - Our initialization function. Note that we cannot return * error because we have compiled-in VFS hooks. So an (unlikely) failure here * must result in panic(). */ static int __init fanotify_user_setup(void) { struct sysinfo si; int max_marks; si_meminfo(&si); /* * Allow up to 1% of addressable memory to be accounted for per user * marks limited to the range [8192, 1048576]. mount and sb marks are * a lot cheaper than inode marks, but there is no reason for a user * to have many of those, so calculate by the cost of inode marks. */ max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) / INODE_MARK_COST; max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS, FANOTIFY_DEFAULT_MAX_USER_MARKS); BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event, SLAB_PANIC); fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event, SLAB_PANIC); if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) { fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event, SLAB_PANIC); } fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS; init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] = FANOTIFY_DEFAULT_MAX_GROUPS; init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks; fanotify_sysctls_init(); return 0; } device_initcall(fanotify_user_setup);
4 1 1 4 3 3 3 1 4 7 7 7 7 6 4 3 4 3 6 3 2 3 3 3 3 3 3 3 3 3 3 3 2 3 3 1 1 3 4 3 3 3 3 2 3 1 3 3 2 1 1 3 3 1 1 6 7 6 6 3 3 3 1 2 2 1 3 3 3 3 3 3 3 3 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) International Business Machines Corp., 2000-2004 */ /* * jfs_dtree.c: directory B+-tree manager * * B+-tree with variable length key directory: * * each directory page is structured as an array of 32-byte * directory entry slots initialized as a freelist * to avoid search/compaction of free space at insertion. * when an entry is inserted, a number of slots are allocated * from the freelist as required to store variable length data * of the entry; when the entry is deleted, slots of the entry * are returned to freelist. * * leaf entry stores full name as key and file serial number * (aka inode number) as data. * internal/router entry stores sufffix compressed name * as key and simple extent descriptor as data. * * each directory page maintains a sorted entry index table * which stores the start slot index of sorted entries * to allow binary search on the table. * * directory starts as a root/leaf page in on-disk inode * inline data area. * when it becomes full, it starts a leaf of a external extent * of length of 1 block. each time the first leaf becomes full, * it is extended rather than split (its size is doubled), * until its length becoms 4 KBytes, from then the extent is split * with new 4 Kbyte extent when it becomes full * to reduce external fragmentation of small directories. * * blah, blah, blah, for linear scan of directory in pieces by * readdir(). * * * case-insensitive directory file system * * names are stored in case-sensitive way in leaf entry. * but stored, searched and compared in case-insensitive (uppercase) order * (i.e., both search key and entry key are folded for search/compare): * (note that case-sensitive order is BROKEN in storage, e.g., * sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad * * entries which folds to the same key makes up a equivalent class * whose members are stored as contiguous cluster (may cross page boundary) * but whose order is arbitrary and acts as duplicate, e.g., * abc, Abc, aBc, abC) * * once match is found at leaf, requires scan forward/backward * either for, in case-insensitive search, duplicate * or for, in case-sensitive search, for exact match * * router entry must be created/stored in case-insensitive way * in internal entry: * (right most key of left page and left most key of right page * are folded, and its suffix compression is propagated as router * key in parent) * (e.g., if split occurs <abc> and <aBd>, <ABD> trather than <aB> * should be made the router key for the split) * * case-insensitive search: * * fold search key; * * case-insensitive search of B-tree: * for internal entry, router key is already folded; * for leaf entry, fold the entry key before comparison. * * if (leaf entry case-insensitive match found) * if (next entry satisfies case-insensitive match) * return EDUPLICATE; * if (prev entry satisfies case-insensitive match) * return EDUPLICATE; * return match; * else * return no match; * * serialization: * target directory inode lock is being held on entry/exit * of all main directory service routines. * * log based recovery: */ #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/slab.h> #include "jfs_incore.h" #include "jfs_superblock.h" #include "jfs_filsys.h" #include "jfs_metapage.h" #include "jfs_dmap.h" #include "jfs_unicode.h" #include "jfs_debug.h" /* dtree split parameter */ struct dtsplit { struct metapage *mp; s16 index; s16 nslot; struct component_name *key; ddata_t *data; struct pxdlist *pxdlist; }; #define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot) /* get page buffer for specified block address */ #define DT_GETPAGE(IP, BN, MP, SIZE, P, RC) \ do { \ BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot); \ if (!(RC)) { \ if (((P)->header.nextindex > \ (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \ ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) { \ BT_PUTPAGE(MP); \ jfs_error((IP)->i_sb, \ "DT_GETPAGE: dtree page corrupt\n"); \ MP = NULL; \ RC = -EIO; \ } \ } \ } while (0) /* for consistency */ #define DT_PUTPAGE(MP) BT_PUTPAGE(MP) #define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot) /* * forward references */ static int dtSplitUp(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack); static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp); static int dtExtendPage(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack); static int dtSplitRoot(tid_t tid, struct inode *ip, struct dtsplit * split, struct metapage ** rmpp); static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, dtpage_t * fp, struct btstack * btstack); static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p); static int dtReadFirst(struct inode *ip, struct btstack * btstack); static int dtReadNext(struct inode *ip, loff_t * offset, struct btstack * btstack); static int dtCompare(struct component_name * key, dtpage_t * p, int si); static int ciCompare(struct component_name * key, dtpage_t * p, int si, int flag); static void dtGetKey(dtpage_t * p, int i, struct component_name * key, int flag); static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, int ri, struct component_name * key, int flag); static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key, ddata_t * data, struct dt_lock **); static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, struct dt_lock ** sdtlock, struct dt_lock ** ddtlock, int do_index); static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock); static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock); static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock); #define ciToUpper(c) UniStrupr((c)->name) /* * read_index_page() * * Reads a page of a directory's index table. * Having metadata mapped into the directory inode's address space * presents a multitude of problems. We avoid this by mapping to * the absolute address space outside of the *_metapage routines */ static struct metapage *read_index_page(struct inode *inode, s64 blkno) { int rc; s64 xaddr; int xflag; s32 xlen; rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); if (rc || (xaddr == 0)) return NULL; return read_metapage(inode, xaddr, PSIZE, 1); } /* * get_index_page() * * Same as get_index_page(), but get's a new page without reading */ static struct metapage *get_index_page(struct inode *inode, s64 blkno) { int rc; s64 xaddr; int xflag; s32 xlen; rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); if (rc || (xaddr == 0)) return NULL; return get_metapage(inode, xaddr, PSIZE, 1); } /* * find_index() * * Returns dtree page containing directory table entry for specified * index and pointer to its entry. * * mp must be released by caller. */ static struct dir_table_slot *find_index(struct inode *ip, u32 index, struct metapage ** mp, s64 *lblock) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); s64 blkno; s64 offset; int page_offset; struct dir_table_slot *slot; static int maxWarnings = 10; if (index < 2) { if (maxWarnings) { jfs_warn("find_entry called with index = %d", index); maxWarnings--; } return NULL; } if (index >= jfs_ip->next_index) { jfs_warn("find_entry called with index >= next_index"); return NULL; } if (jfs_dirtable_inline(ip)) { /* * Inline directory table */ *mp = NULL; slot = &jfs_ip->i_dirtable[index - 2]; } else { offset = (index - 2) * sizeof(struct dir_table_slot); page_offset = offset & (PSIZE - 1); blkno = ((offset + 1) >> L2PSIZE) << JFS_SBI(ip->i_sb)->l2nbperpage; if (*mp && (*lblock != blkno)) { release_metapage(*mp); *mp = NULL; } if (!(*mp)) { *lblock = blkno; *mp = read_index_page(ip, blkno); } if (!(*mp)) { jfs_err("free_index: error reading directory table"); return NULL; } slot = (struct dir_table_slot *) ((char *) (*mp)->data + page_offset); } return slot; } static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp, u32 index) { struct tlock *tlck; struct linelock *llck; struct lv *lv; tlck = txLock(tid, ip, mp, tlckDATA); llck = (struct linelock *) tlck->lock; if (llck->index >= llck->maxcnt) llck = txLinelock(llck); lv = &llck->lv[llck->index]; /* * Linelock slot size is twice the size of directory table * slot size. 512 entries per page. */ lv->offset = ((index - 2) & 511) >> 1; lv->length = 1; llck->index++; } /* * add_index() * * Adds an entry to the directory index table. This is used to provide * each directory entry with a persistent index in which to resume * directory traversals */ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) { struct super_block *sb = ip->i_sb; struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_inode_info *jfs_ip = JFS_IP(ip); u64 blkno; struct dir_table_slot *dirtab_slot; u32 index; struct linelock *llck; struct lv *lv; struct metapage *mp; s64 offset; uint page_offset; struct tlock *tlck; s64 xaddr; ASSERT(DO_INDEX(ip)); if (jfs_ip->next_index < 2) { jfs_warn("add_index: next_index = %d. Resetting!", jfs_ip->next_index); jfs_ip->next_index = 2; } index = jfs_ip->next_index++; if (index <= MAX_INLINE_DIRTABLE_ENTRY) { /* * i_size reflects size of index table, or 8 bytes per entry. */ ip->i_size = (loff_t) (index - 1) << 3; /* * dir table fits inline within inode */ dirtab_slot = &jfs_ip->i_dirtable[index-2]; dirtab_slot->flag = DIR_INDEX_VALID; dirtab_slot->slot = slot; DTSaddress(dirtab_slot, bn); set_cflag(COMMIT_Dirtable, ip); return index; } if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) { struct dir_table_slot temp_table[12]; /* * It's time to move the inline table to an external * page and begin to build the xtree */ if (dquot_alloc_block(ip, sbi->nbperpage)) goto clean_up; if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) { dquot_free_block(ip, sbi->nbperpage); goto clean_up; } /* * Save the table, we're going to overwrite it with the * xtree root */ memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table)); /* * Initialize empty x-tree */ xtInitRoot(tid, ip); /* * Add the first block to the xtree */ if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) { /* This really shouldn't fail */ jfs_warn("add_index: xtInsert failed!"); memcpy(&jfs_ip->i_dirtable, temp_table, sizeof (temp_table)); dbFree(ip, xaddr, sbi->nbperpage); dquot_free_block(ip, sbi->nbperpage); goto clean_up; } ip->i_size = PSIZE; mp = get_index_page(ip, 0); if (!mp) { jfs_err("add_index: get_metapage failed!"); xtTruncate(tid, ip, 0, COMMIT_PWMAP); memcpy(&jfs_ip->i_dirtable, temp_table, sizeof (temp_table)); goto clean_up; } tlck = txLock(tid, ip, mp, tlckDATA); llck = (struct linelock *) & tlck->lock; ASSERT(llck->index == 0); lv = &llck->lv[0]; lv->offset = 0; lv->length = 6; /* tlckDATA slot size is 16 bytes */ llck->index++; memcpy(mp->data, temp_table, sizeof(temp_table)); mark_metapage_dirty(mp); release_metapage(mp); /* * Logging is now directed by xtree tlocks */ clear_cflag(COMMIT_Dirtable, ip); } offset = (index - 2) * sizeof(struct dir_table_slot); page_offset = offset & (PSIZE - 1); blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage; if (page_offset == 0) { /* * This will be the beginning of a new page */ xaddr = 0; if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) { jfs_warn("add_index: xtInsert failed!"); goto clean_up; } ip->i_size += PSIZE; if ((mp = get_index_page(ip, blkno))) memset(mp->data, 0, PSIZE); /* Just looks better */ else xtTruncate(tid, ip, offset, COMMIT_PWMAP); } else mp = read_index_page(ip, blkno); if (!mp) { jfs_err("add_index: get/read_metapage failed!"); goto clean_up; } lock_index(tid, ip, mp, index); dirtab_slot = (struct dir_table_slot *) ((char *) mp->data + page_offset); dirtab_slot->flag = DIR_INDEX_VALID; dirtab_slot->slot = slot; DTSaddress(dirtab_slot, bn); mark_metapage_dirty(mp); release_metapage(mp); return index; clean_up: jfs_ip->next_index--; return 0; } /* * free_index() * * Marks an entry to the directory index table as free. */ static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next) { struct dir_table_slot *dirtab_slot; s64 lblock; struct metapage *mp = NULL; dirtab_slot = find_index(ip, index, &mp, &lblock); if (!dirtab_slot) return; dirtab_slot->flag = DIR_INDEX_FREE; dirtab_slot->slot = dirtab_slot->addr1 = 0; dirtab_slot->addr2 = cpu_to_le32(next); if (mp) { lock_index(tid, ip, mp, index); mark_metapage_dirty(mp); release_metapage(mp); } else set_cflag(COMMIT_Dirtable, ip); } /* * modify_index() * * Changes an entry in the directory index table */ static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn, int slot, struct metapage ** mp, s64 *lblock) { struct dir_table_slot *dirtab_slot; dirtab_slot = find_index(ip, index, mp, lblock); if (!dirtab_slot) return; DTSaddress(dirtab_slot, bn); dirtab_slot->slot = slot; if (*mp) { lock_index(tid, ip, *mp, index); mark_metapage_dirty(*mp); } else set_cflag(COMMIT_Dirtable, ip); } /* * read_index() * * reads a directory table slot */ static int read_index(struct inode *ip, u32 index, struct dir_table_slot * dirtab_slot) { s64 lblock; struct metapage *mp = NULL; struct dir_table_slot *slot; slot = find_index(ip, index, &mp, &lblock); if (!slot) { return -EIO; } memcpy(dirtab_slot, slot, sizeof(struct dir_table_slot)); if (mp) release_metapage(mp); return 0; } /* * dtSearch() * * function: * Search for the entry with specified key * * parameter: * * return: 0 - search result on stack, leaf page pinned; * errno - I/O error */ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, struct btstack * btstack, int flag) { int rc = 0; int cmp = 1; /* init for empty page */ s64 bn; struct metapage *mp; dtpage_t *p; s8 *stbl; int base, index, lim; struct btframe *btsp; pxd_t *pxd; int psize = 288; /* initial in-line directory */ ino_t inumber; struct component_name ciKey; struct super_block *sb = ip->i_sb; ciKey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), GFP_NOFS); if (!ciKey.name) { rc = -ENOMEM; goto dtSearch_Exit2; } /* uppercase search key for c-i directory */ UniStrcpy(ciKey.name, key->name); ciKey.namlen = key->namlen; /* only uppercase if case-insensitive support is on */ if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) { ciToUpper(&ciKey); } BT_CLR(btstack); /* reset stack */ /* init level count for max pages to split */ btstack->nsplit = 1; /* * search down tree from root: * * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of * internal page, child page Pi contains entry with k, Ki <= K < Kj. * * if entry with search key K is not found * internal page search find the entry with largest key Ki * less than K which point to the child page to search; * leaf page search find the entry with smallest key Kj * greater than K so that the returned index is the position of * the entry to be shifted right for insertion of new entry. * for empty tree, search key is greater than any key of the tree. * * by convention, root bn = 0. */ for (bn = 0;;) { /* get/pin the page to search */ DT_GETPAGE(ip, bn, mp, psize, p, rc); if (rc) goto dtSearch_Exit1; /* get sorted entry table of the page */ stbl = DT_GETSTBL(p); /* * binary search with search key K on the current page. */ for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) { index = base + (lim >> 1); if (p->header.flag & BT_LEAF) { /* uppercase leaf name to compare */ cmp = ciCompare(&ciKey, p, stbl[index], JFS_SBI(sb)->mntflag); } else { /* router key is in uppercase */ cmp = dtCompare(&ciKey, p, stbl[index]); } if (cmp == 0) { /* * search hit */ /* search hit - leaf page: * return the entry found */ if (p->header.flag & BT_LEAF) { inumber = le32_to_cpu( ((struct ldtentry *) & p->slot[stbl[index]])->inumber); /* * search for JFS_LOOKUP */ if (flag == JFS_LOOKUP) { *data = inumber; rc = 0; goto out; } /* * search for JFS_CREATE */ if (flag == JFS_CREATE) { *data = inumber; rc = -EEXIST; goto out; } /* * search for JFS_REMOVE or JFS_RENAME */ if ((flag == JFS_REMOVE || flag == JFS_RENAME) && *data != inumber) { rc = -ESTALE; goto out; } /* * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME */ /* save search result */ *data = inumber; btsp = btstack->top; btsp->bn = bn; btsp->index = index; btsp->mp = mp; rc = 0; goto dtSearch_Exit1; } /* search hit - internal page: * descend/search its child page */ goto getChild; } if (cmp > 0) { base = index + 1; --lim; } } /* * search miss * * base is the smallest index with key (Kj) greater than * search key (K) and may be zero or (maxindex + 1) index. */ /* * search miss - leaf page * * return location of entry (base) where new entry with * search key K is to be inserted. */ if (p->header.flag & BT_LEAF) { /* * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME */ if (flag == JFS_LOOKUP || flag == JFS_REMOVE || flag == JFS_RENAME) { rc = -ENOENT; goto out; } /* * search for JFS_CREATE|JFS_FINDDIR: * * save search result */ *data = 0; btsp = btstack->top; btsp->bn = bn; btsp->index = base; btsp->mp = mp; rc = 0; goto dtSearch_Exit1; } /* * search miss - internal page * * if base is non-zero, decrement base by one to get the parent * entry of the child page to search. */ index = base ? base - 1 : base; /* * go down to child page */ getChild: /* update max. number of pages to split */ if (BT_STACK_FULL(btstack)) { /* Something's corrupted, mark filesystem dirty so * chkdsk will fix it. */ jfs_error(sb, "stack overrun!\n"); BT_STACK_DUMP(btstack); rc = -EIO; goto out; } btstack->nsplit++; /* push (bn, index) of the parent page/entry */ BT_PUSH(btstack, bn, index); /* get the child page block number */ pxd = (pxd_t *) & p->slot[stbl[index]]; bn = addressPXD(pxd); psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize; /* unpin the parent page */ DT_PUTPAGE(mp); } out: DT_PUTPAGE(mp); dtSearch_Exit1: kfree(ciKey.name); dtSearch_Exit2: return rc; } /* * dtInsert() * * function: insert an entry to directory tree * * parameter: * * return: 0 - success; * errno - failure; */ int dtInsert(tid_t tid, struct inode *ip, struct component_name * name, ino_t * fsn, struct btstack * btstack) { int rc = 0; struct metapage *mp; /* meta-page buffer */ dtpage_t *p; /* base B+-tree index page */ s64 bn; int index; struct dtsplit split; /* split information */ ddata_t data; struct dt_lock *dtlck; int n; struct tlock *tlck; struct lv *lv; /* * retrieve search result * * dtSearch() returns (leaf page pinned, index at which to insert). * n.b. dtSearch() may return index of (maxindex + 1) of * the full page. */ DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); /* * insert entry for new key */ if (DO_INDEX(ip)) { if (JFS_IP(ip)->next_index == DIREND) { DT_PUTPAGE(mp); return -EMLINK; } n = NDTLEAF(name->namlen); data.leaf.tid = tid; data.leaf.ip = ip; } else { n = NDTLEAF_LEGACY(name->namlen); data.leaf.ip = NULL; /* signifies legacy directory format */ } data.leaf.ino = *fsn; /* * leaf page does not have enough room for new entry: * * extend/split the leaf page; * * dtSplitUp() will insert the entry and unpin the leaf page. */ if (n > p->header.freecnt) { split.mp = mp; split.index = index; split.nslot = n; split.key = name; split.data = &data; rc = dtSplitUp(tid, ip, &split, btstack); return rc; } /* * leaf page does have enough room for new entry: * * insert the new data entry into the leaf page; */ BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the leaf page */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; /* linelock header */ lv->offset = 0; lv->length = 1; dtlck->index++; dtInsertEntry(p, index, name, &data, &dtlck); /* linelock stbl of non-root leaf page */ if (!(p->header.flag & BT_ROOT)) { if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; n = index >> L2DTSLOTSIZE; lv->offset = p->header.stblindex + n; lv->length = ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; dtlck->index++; } /* unpin the leaf page */ DT_PUTPAGE(mp); return 0; } /* * dtSplitUp() * * function: propagate insertion bottom up; * * parameter: * * return: 0 - success; * errno - failure; * leaf page unpinned; */ static int dtSplitUp(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack) { struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); int rc = 0; struct metapage *smp; dtpage_t *sp; /* split page */ struct metapage *rmp; dtpage_t *rp; /* new right page split from sp */ pxd_t rpxd; /* new right page extent descriptor */ struct metapage *lmp; dtpage_t *lp; /* left child page */ int skip; /* index of entry of insertion */ struct btframe *parent; /* parent page entry on traverse stack */ s64 xaddr, nxaddr; int xlen, xsize; struct pxdlist pxdlist; pxd_t *pxd; struct component_name key = { 0, NULL }; ddata_t *data = split->data; int n; struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; int quota_allocation = 0; /* get split page */ smp = split->mp; sp = DT_PAGE(ip, smp); key.name = kmalloc_array(JFS_NAME_MAX + 2, sizeof(wchar_t), GFP_NOFS); if (!key.name) { DT_PUTPAGE(smp); rc = -ENOMEM; goto dtSplitUp_Exit; } /* * split leaf page * * The split routines insert the new entry, and * acquire txLock as appropriate. */ /* * split root leaf page: */ if (sp->header.flag & BT_ROOT) { /* * allocate a single extent child page */ xlen = 1; n = sbi->bsize >> L2DTSLOTSIZE; n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */ if (n <= split->nslot) xlen++; if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) { DT_PUTPAGE(smp); goto freeKeyName; } pxdlist.maxnpxd = 1; pxdlist.npxd = 0; pxd = &pxdlist.pxd[0]; PXDaddress(pxd, xaddr); PXDlength(pxd, xlen); split->pxdlist = &pxdlist; rc = dtSplitRoot(tid, ip, split, &rmp); if (rc) dbFree(ip, xaddr, xlen); else DT_PUTPAGE(rmp); DT_PUTPAGE(smp); if (!DO_INDEX(ip)) ip->i_size = xlen << sbi->l2bsize; goto freeKeyName; } /* * extend first leaf page * * extend the 1st extent if less than buffer page size * (dtExtendPage() reurns leaf page unpinned) */ pxd = &sp->header.self; xlen = lengthPXD(pxd); xsize = xlen << sbi->l2bsize; if (xsize < PSIZE) { xaddr = addressPXD(pxd); n = xsize >> L2DTSLOTSIZE; n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ if ((n + sp->header.freecnt) <= split->nslot) n = xlen + (xlen << 1); else n = xlen; /* Allocate blocks to quota. */ rc = dquot_alloc_block(ip, n); if (rc) goto extendOut; quota_allocation += n; if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, (s64) n, &nxaddr))) goto extendOut; pxdlist.maxnpxd = 1; pxdlist.npxd = 0; pxd = &pxdlist.pxd[0]; PXDaddress(pxd, nxaddr); PXDlength(pxd, xlen + n); split->pxdlist = &pxdlist; if ((rc = dtExtendPage(tid, ip, split, btstack))) { nxaddr = addressPXD(pxd); if (xaddr != nxaddr) { /* free relocated extent */ xlen = lengthPXD(pxd); dbFree(ip, nxaddr, (s64) xlen); } else { /* free extended delta */ xlen = lengthPXD(pxd) - n; xaddr = addressPXD(pxd) + xlen; dbFree(ip, xaddr, (s64) n); } } else if (!DO_INDEX(ip)) ip->i_size = lengthPXD(pxd) << sbi->l2bsize; extendOut: DT_PUTPAGE(smp); goto freeKeyName; } /* * split leaf page <sp> into <sp> and a new right page <rp>. * * return <rp> pinned and its extent descriptor <rpxd> */ /* * allocate new directory page extent and * new index page(s) to cover page split(s) * * allocation hint: ? */ n = btstack->nsplit; pxdlist.maxnpxd = pxdlist.npxd = 0; xlen = sbi->nbperpage; for (pxd = pxdlist.pxd; n > 0; n--, pxd++) { if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) { PXDaddress(pxd, xaddr); PXDlength(pxd, xlen); pxdlist.maxnpxd++; continue; } DT_PUTPAGE(smp); /* undo allocation */ goto splitOut; } split->pxdlist = &pxdlist; if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) { DT_PUTPAGE(smp); /* undo allocation */ goto splitOut; } if (!DO_INDEX(ip)) ip->i_size += PSIZE; /* * propagate up the router entry for the leaf page just split * * insert a router entry for the new page into the parent page, * propagate the insert/split up the tree by walking back the stack * of (bn of parent page, index of child page entry in parent page) * that were traversed during the search for the page that split. * * the propagation of insert/split up the tree stops if the root * splits or the page inserted into doesn't have to split to hold * the new entry. * * the parent entry for the split page remains the same, and * a new entry is inserted at its right with the first key and * block number of the new right page. * * There are a maximum of 4 pages pinned at any time: * two children, left parent and right parent (when the parent splits). * keep the child pages pinned while working on the parent. * make sure that all pins are released at exit. */ while ((parent = BT_POP(btstack)) != NULL) { /* parent page specified by stack frame <parent> */ /* keep current child pages (<lp>, <rp>) pinned */ lmp = smp; lp = sp; /* * insert router entry in parent for new right child page <rp> */ /* get the parent page <sp> */ DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); if (rc) { DT_PUTPAGE(lmp); DT_PUTPAGE(rmp); goto splitOut; } /* * The new key entry goes ONE AFTER the index of parent entry, * because the split was to the right. */ skip = parent->index + 1; /* * compute the key for the router entry * * key suffix compression: * for internal pages that have leaf pages as children, * retain only what's needed to distinguish between * the new entry and the entry on the page to its left. * If the keys compare equal, retain the entire key. * * note that compression is performed only at computing * router key at the lowest internal level. * further compression of the key between pairs of higher * level internal pages loses too much information and * the search may fail. * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,} * results in two adjacent parent entries (a)(xx). * if split occurs between these two entries, and * if compression is applied, the router key of parent entry * of right page (x) will divert search for x into right * subtree and miss x in the left subtree.) * * the entire key must be retained for the next-to-leftmost * internal key at any level of the tree, or search may fail * (e.g., ?) */ switch (rp->header.flag & BT_TYPE) { case BT_LEAF: /* * compute the length of prefix for suffix compression * between last entry of left page and first entry * of right page */ if ((sp->header.flag & BT_ROOT && skip > 1) || sp->header.prev != 0 || skip > 1) { /* compute uppercase router prefix key */ rc = ciGetLeafPrefixKey(lp, lp->header.nextindex-1, rp, 0, &key, sbi->mntflag); if (rc) { DT_PUTPAGE(lmp); DT_PUTPAGE(rmp); DT_PUTPAGE(smp); goto splitOut; } } else { /* next to leftmost entry of lowest internal level */ /* compute uppercase router key */ dtGetKey(rp, 0, &key, sbi->mntflag); key.name[key.namlen] = 0; if ((sbi->mntflag & JFS_OS2) == JFS_OS2) ciToUpper(&key); } n = NDTINTERNAL(key.namlen); break; case BT_INTERNAL: dtGetKey(rp, 0, &key, sbi->mntflag); n = NDTINTERNAL(key.namlen); break; default: jfs_err("dtSplitUp(): UFO!"); break; } /* unpin left child page */ DT_PUTPAGE(lmp); /* * compute the data for the router entry */ data->xd = rpxd; /* child page xd */ /* * parent page is full - split the parent page */ if (n > sp->header.freecnt) { /* init for parent page split */ split->mp = smp; split->index = skip; /* index at insert */ split->nslot = n; split->key = &key; /* split->data = data; */ /* unpin right child page */ DT_PUTPAGE(rmp); /* The split routines insert the new entry, * acquire txLock as appropriate. * return <rp> pinned and its block number <rbn>. */ rc = (sp->header.flag & BT_ROOT) ? dtSplitRoot(tid, ip, split, &rmp) : dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd); if (rc) { DT_PUTPAGE(smp); goto splitOut; } /* smp and rmp are pinned */ } /* * parent page is not full - insert router entry in parent page */ else { BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the parent page */ tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; /* linelock header */ lv->offset = 0; lv->length = 1; dtlck->index++; /* linelock stbl of non-root parent page */ if (!(sp->header.flag & BT_ROOT)) { lv++; n = skip >> L2DTSLOTSIZE; lv->offset = sp->header.stblindex + n; lv->length = ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; dtlck->index++; } dtInsertEntry(sp, skip, &key, data, &dtlck); /* exit propagate up */ break; } } /* unpin current split and its right page */ DT_PUTPAGE(smp); DT_PUTPAGE(rmp); /* * free remaining extents allocated for split */ splitOut: n = pxdlist.npxd; pxd = &pxdlist.pxd[n]; for (; n < pxdlist.maxnpxd; n++, pxd++) dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd)); freeKeyName: kfree(key.name); /* Rollback quota allocation */ if (rc && quota_allocation) dquot_free_block(ip, quota_allocation); dtSplitUp_Exit: return rc; } /* * dtSplitPage() * * function: Split a non-root page of a btree. * * parameter: * * return: 0 - success; * errno - failure; * return split and new page pinned; */ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp) { int rc = 0; struct metapage *smp; dtpage_t *sp; struct metapage *rmp; dtpage_t *rp; /* new right page allocated */ s64 rbn; /* new right page block number */ struct metapage *mp; dtpage_t *p; s64 nextbn; struct pxdlist *pxdlist; pxd_t *pxd; int skip, nextindex, half, left, nxt, off, si; struct ldtentry *ldtentry; struct idtentry *idtentry; u8 *stbl; struct dtslot *f; int fsi, stblsize; int n; struct dt_lock *sdtlck, *rdtlck; struct tlock *tlck; struct dt_lock *dtlck; struct lv *slv, *rlv, *lv; /* get split page */ smp = split->mp; sp = DT_PAGE(ip, smp); /* * allocate the new right page for the split */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; pxdlist->npxd++; rbn = addressPXD(pxd); rmp = get_metapage(ip, rbn, PSIZE, 1); if (rmp == NULL) return -EIO; /* Allocate blocks to quota. */ rc = dquot_alloc_block(ip, lengthPXD(pxd)); if (rc) { release_metapage(rmp); return rc; } jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); BT_MARK_DIRTY(rmp, ip); /* * acquire a transaction lock on the new right page */ tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); rdtlck = (struct dt_lock *) & tlck->lock; rp = (dtpage_t *) rmp->data; *rpp = rp; rp->header.self = *pxd; BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the split page * * action: */ tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); sdtlck = (struct dt_lock *) & tlck->lock; /* linelock header of split page */ ASSERT(sdtlck->index == 0); slv = & sdtlck->lv[0]; slv->offset = 0; slv->length = 1; sdtlck->index++; /* * initialize/update sibling pointers between sp and rp */ nextbn = le64_to_cpu(sp->header.next); rp->header.next = cpu_to_le64(nextbn); rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self)); sp->header.next = cpu_to_le64(rbn); /* * initialize new right page */ rp->header.flag = sp->header.flag; /* compute sorted entry table at start of extent data area */ rp->header.nextindex = 0; rp->header.stblindex = 1; n = PSIZE >> L2DTSLOTSIZE; rp->header.maxslot = n; stblsize = (n + 31) >> L2DTSLOTSIZE; /* in unit of slot */ /* init freelist */ fsi = rp->header.stblindex + stblsize; rp->header.freelist = fsi; rp->header.freecnt = rp->header.maxslot - fsi; /* * sequential append at tail: append without split * * If splitting the last page on a level because of appending * a entry to it (skip is maxentry), it's likely that the access is * sequential. Adding an empty page on the side of the level is less * work and can push the fill factor much higher than normal. * If we're wrong it's no big deal, we'll just do the split the right * way next time. * (It may look like it's equally easy to do a similar hack for * reverse sorted data, that is, split the tree left, * but it's not. Be my guest.) */ if (nextbn == 0 && split->index == sp->header.nextindex) { /* linelock header + stbl (first slot) of new page */ rlv = & rdtlck->lv[rdtlck->index]; rlv->offset = 0; rlv->length = 2; rdtlck->index++; /* * initialize freelist of new right page */ f = &rp->slot[fsi]; for (fsi++; fsi < rp->header.maxslot; f++, fsi++) f->next = fsi; f->next = -1; /* insert entry at the first entry of the new right page */ dtInsertEntry(rp, 0, split->key, split->data, &rdtlck); goto out; } /* * non-sequential insert (at possibly middle page) */ /* * update prev pointer of previous right sibling page; */ if (nextbn != 0) { DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); if (rc) { discard_metapage(rmp); return rc; } BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the next page */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); jfs_info("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p", tlck, ip, mp); dtlck = (struct dt_lock *) & tlck->lock; /* linelock header of previous right sibling page */ lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; p->header.prev = cpu_to_le64(rbn); DT_PUTPAGE(mp); } /* * split the data between the split and right pages. */ skip = split->index; half = (PSIZE >> L2DTSLOTSIZE) >> 1; /* swag */ left = 0; /* * compute fill factor for split pages * * <nxt> traces the next entry to move to rp * <off> traces the next entry to stay in sp */ stbl = (u8 *) & sp->slot[sp->header.stblindex]; nextindex = sp->header.nextindex; for (nxt = off = 0; nxt < nextindex; ++off) { if (off == skip) /* check for fill factor with new entry size */ n = split->nslot; else { si = stbl[nxt]; switch (sp->header.flag & BT_TYPE) { case BT_LEAF: ldtentry = (struct ldtentry *) & sp->slot[si]; if (DO_INDEX(ip)) n = NDTLEAF(ldtentry->namlen); else n = NDTLEAF_LEGACY(ldtentry-> namlen); break; case BT_INTERNAL: idtentry = (struct idtentry *) & sp->slot[si]; n = NDTINTERNAL(idtentry->namlen); break; default: break; } ++nxt; /* advance to next entry to move in sp */ } left += n; if (left >= half) break; } /* <nxt> poins to the 1st entry to move */ /* * move entries to right page * * dtMoveEntry() initializes rp and reserves entry for insertion * * split page moved out entries are linelocked; * new/right page moved in entries are linelocked; */ /* linelock header + stbl of new right page */ rlv = & rdtlck->lv[rdtlck->index]; rlv->offset = 0; rlv->length = 5; rdtlck->index++; dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip)); sp->header.nextindex = nxt; /* * finalize freelist of new right page */ fsi = rp->header.freelist; f = &rp->slot[fsi]; for (fsi++; fsi < rp->header.maxslot; f++, fsi++) f->next = fsi; f->next = -1; /* * Update directory index table for entries now in right page */ if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { s64 lblock; mp = NULL; stbl = DT_GETSTBL(rp); for (n = 0; n < rp->header.nextindex; n++) { ldtentry = (struct ldtentry *) & rp->slot[stbl[n]]; modify_index(tid, ip, le32_to_cpu(ldtentry->index), rbn, n, &mp, &lblock); } if (mp) release_metapage(mp); } /* * the skipped index was on the left page, */ if (skip <= off) { /* insert the new entry in the split page */ dtInsertEntry(sp, skip, split->key, split->data, &sdtlck); /* linelock stbl of split page */ if (sdtlck->index >= sdtlck->maxcnt) sdtlck = (struct dt_lock *) txLinelock(sdtlck); slv = & sdtlck->lv[sdtlck->index]; n = skip >> L2DTSLOTSIZE; slv->offset = sp->header.stblindex + n; slv->length = ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; sdtlck->index++; } /* * the skipped index was on the right page, */ else { /* adjust the skip index to reflect the new position */ skip -= nxt; /* insert the new entry in the right page */ dtInsertEntry(rp, skip, split->key, split->data, &rdtlck); } out: *rmpp = rmp; *rpxdp = *pxd; return rc; } /* * dtExtendPage() * * function: extend 1st/only directory leaf page * * parameter: * * return: 0 - success; * errno - failure; * return extended page pinned; */ static int dtExtendPage(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack) { struct super_block *sb = ip->i_sb; int rc; struct metapage *smp, *pmp, *mp; dtpage_t *sp, *pp; struct pxdlist *pxdlist; pxd_t *pxd, *tpxd; int xlen, xsize; int newstblindex, newstblsize; int oldstblindex, oldstblsize; int fsi, last; struct dtslot *f; struct btframe *parent; int n; struct dt_lock *dtlck; s64 xaddr, txaddr; struct tlock *tlck; struct pxd_lock *pxdlock; struct lv *lv; uint type; struct ldtentry *ldtentry; u8 *stbl; /* get page to extend */ smp = split->mp; sp = DT_PAGE(ip, smp); /* get parent/root page */ parent = BT_POP(btstack); DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc); if (rc) return (rc); /* * extend the extent */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; pxdlist->npxd++; xaddr = addressPXD(pxd); tpxd = &sp->header.self; txaddr = addressPXD(tpxd); /* in-place extension */ if (xaddr == txaddr) { type = tlckEXTEND; } /* relocation */ else { type = tlckNEW; /* save moved extent descriptor for later free */ tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE); pxdlock = (struct pxd_lock *) & tlck->lock; pxdlock->flag = mlckFREEPXD; pxdlock->pxd = sp->header.self; pxdlock->index = 1; /* * Update directory index table to reflect new page address */ if (DO_INDEX(ip)) { s64 lblock; mp = NULL; stbl = DT_GETSTBL(sp); for (n = 0; n < sp->header.nextindex; n++) { ldtentry = (struct ldtentry *) & sp->slot[stbl[n]]; modify_index(tid, ip, le32_to_cpu(ldtentry->index), xaddr, n, &mp, &lblock); } if (mp) release_metapage(mp); } } /* * extend the page */ sp->header.self = *pxd; jfs_info("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p", ip, smp, sp); BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the extended/leaf page */ tlck = txLock(tid, ip, smp, tlckDTREE | type); dtlck = (struct dt_lock *) & tlck->lock; lv = & dtlck->lv[0]; /* update buffer extent descriptor of extended page */ xlen = lengthPXD(pxd); xsize = xlen << JFS_SBI(sb)->l2bsize; /* * copy old stbl to new stbl at start of extended area */ oldstblindex = sp->header.stblindex; oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE; newstblindex = sp->header.maxslot; n = xsize >> L2DTSLOTSIZE; newstblsize = (n + 31) >> L2DTSLOTSIZE; memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex], sp->header.nextindex); /* * in-line extension: linelock old area of extended page */ if (type == tlckEXTEND) { /* linelock header */ lv->offset = 0; lv->length = 1; dtlck->index++; lv++; /* linelock new stbl of extended page */ lv->offset = newstblindex; lv->length = newstblsize; } /* * relocation: linelock whole relocated area */ else { lv->offset = 0; lv->length = sp->header.maxslot + newstblsize; } dtlck->index++; sp->header.maxslot = n; sp->header.stblindex = newstblindex; /* sp->header.nextindex remains the same */ /* * add old stbl region at head of freelist */ fsi = oldstblindex; f = &sp->slot[fsi]; last = sp->header.freelist; for (n = 0; n < oldstblsize; n++, fsi++, f++) { f->next = last; last = fsi; } sp->header.freelist = last; sp->header.freecnt += oldstblsize; /* * append free region of newly extended area at tail of freelist */ /* init free region of newly extended area */ fsi = n = newstblindex + newstblsize; f = &sp->slot[fsi]; for (fsi++; fsi < sp->header.maxslot; f++, fsi++) f->next = fsi; f->next = -1; /* append new free region at tail of old freelist */ fsi = sp->header.freelist; if (fsi == -1) sp->header.freelist = n; else { do { f = &sp->slot[fsi]; fsi = f->next; } while (fsi != -1); f->next = n; } sp->header.freecnt += sp->header.maxslot - n; /* * insert the new entry */ dtInsertEntry(sp, split->index, split->key, split->data, &dtlck); BT_MARK_DIRTY(pmp, ip); /* * linelock any freeslots residing in old extent */ if (type == tlckEXTEND) { n = sp->header.maxslot >> 2; if (sp->header.freelist < n) dtLinelockFreelist(sp, n, &dtlck); } /* * update parent entry on the parent/root page */ /* * acquire a transaction lock on the parent/root page */ tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; lv = & dtlck->lv[dtlck->index]; /* linelock parent entry - 1st slot */ lv->offset = 1; lv->length = 1; dtlck->index++; /* update the parent pxd for page extension */ tpxd = (pxd_t *) & pp->slot[1]; *tpxd = *pxd; DT_PUTPAGE(pmp); return 0; } /* * dtSplitRoot() * * function: * split the full root page into * original/root/split page and new right page * i.e., root remains fixed in tree anchor (inode) and * the root is copied to a single new right child page * since root page << non-root page, and * the split root page contains a single entry for the * new right child page. * * parameter: * * return: 0 - success; * errno - failure; * return new page pinned; */ static int dtSplitRoot(tid_t tid, struct inode *ip, struct dtsplit * split, struct metapage ** rmpp) { struct super_block *sb = ip->i_sb; struct metapage *smp; dtroot_t *sp; struct metapage *rmp; dtpage_t *rp; s64 rbn; int xlen; int xsize; struct dtslot *f; s8 *stbl; int fsi, stblsize, n; struct idtentry *s; pxd_t *ppxd; struct pxdlist *pxdlist; pxd_t *pxd; struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; int rc; /* get split root page */ smp = split->mp; sp = &JFS_IP(ip)->i_dtroot; /* * allocate/initialize a single (right) child page * * N.B. at first split, a one (or two) block to fit new entry * is allocated; at subsequent split, a full page is allocated; */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; pxdlist->npxd++; rbn = addressPXD(pxd); xlen = lengthPXD(pxd); xsize = xlen << JFS_SBI(sb)->l2bsize; rmp = get_metapage(ip, rbn, xsize, 1); if (!rmp) return -EIO; rp = rmp->data; /* Allocate blocks to quota. */ rc = dquot_alloc_block(ip, lengthPXD(pxd)); if (rc) { release_metapage(rmp); return rc; } BT_MARK_DIRTY(rmp, ip); /* * acquire a transaction lock on the new right page */ tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); dtlck = (struct dt_lock *) & tlck->lock; rp->header.flag = (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL; rp->header.self = *pxd; /* initialize sibling pointers */ rp->header.next = 0; rp->header.prev = 0; /* * move in-line root page into new right page extent */ /* linelock header + copied entries + new stbl (1st slot) in new page */ ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = 0; lv->length = 10; /* 1 + 8 + 1 */ dtlck->index++; n = xsize >> L2DTSLOTSIZE; rp->header.maxslot = n; stblsize = (n + 31) >> L2DTSLOTSIZE; /* copy old stbl to new stbl at start of extended area */ rp->header.stblindex = DTROOTMAXSLOT; stbl = (s8 *) & rp->slot[DTROOTMAXSLOT]; memcpy(stbl, sp->header.stbl, sp->header.nextindex); rp->header.nextindex = sp->header.nextindex; /* copy old data area to start of new data area */ memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE); /* * append free region of newly extended area at tail of freelist */ /* init free region of newly extended area */ fsi = n = DTROOTMAXSLOT + stblsize; f = &rp->slot[fsi]; for (fsi++; fsi < rp->header.maxslot; f++, fsi++) f->next = fsi; f->next = -1; /* append new free region at tail of old freelist */ fsi = sp->header.freelist; if (fsi == -1) rp->header.freelist = n; else { rp->header.freelist = fsi; do { f = &rp->slot[fsi]; fsi = f->next; } while (fsi != -1); f->next = n; } rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n; /* * Update directory index table for entries now in right page */ if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { s64 lblock; struct metapage *mp = NULL; struct ldtentry *ldtentry; stbl = DT_GETSTBL(rp); for (n = 0; n < rp->header.nextindex; n++) { ldtentry = (struct ldtentry *) & rp->slot[stbl[n]]; modify_index(tid, ip, le32_to_cpu(ldtentry->index), rbn, n, &mp, &lblock); } if (mp) release_metapage(mp); } /* * insert the new entry into the new right/child page * (skip index in the new right page will not change) */ dtInsertEntry(rp, split->index, split->key, split->data, &dtlck); /* * reset parent/root page * * set the 1st entry offset to 0, which force the left-most key * at any level of the tree to be less than any search key. * * The btree comparison code guarantees that the left-most key on any * level of the tree is never used, so it doesn't need to be filled in. */ BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the root page (in-memory inode) */ tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT); dtlck = (struct dt_lock *) & tlck->lock; /* linelock root */ ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = 0; lv->length = DTROOTMAXSLOT; dtlck->index++; /* update page header of root */ if (sp->header.flag & BT_LEAF) { sp->header.flag &= ~BT_LEAF; sp->header.flag |= BT_INTERNAL; } /* init the first entry */ s = (struct idtentry *) & sp->slot[DTENTRYSTART]; ppxd = (pxd_t *) s; *ppxd = *pxd; s->next = -1; s->namlen = 0; stbl = sp->header.stbl; stbl[0] = DTENTRYSTART; sp->header.nextindex = 1; /* init freelist */ fsi = DTENTRYSTART + 1; f = &sp->slot[fsi]; /* init free region of remaining area */ for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) f->next = fsi; f->next = -1; sp->header.freelist = DTENTRYSTART + 1; sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1); *rmpp = rmp; return 0; } /* * dtDelete() * * function: delete the entry(s) referenced by a key. * * parameter: * * return: */ int dtDelete(tid_t tid, struct inode *ip, struct component_name * key, ino_t * ino, int flag) { int rc = 0; s64 bn; struct metapage *mp, *imp; dtpage_t *p; int index; struct btstack btstack; struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; int i; struct ldtentry *ldtentry; u8 *stbl; u32 table_index, next_index; struct metapage *nmp; dtpage_t *np; /* * search for the entry to delete: * * dtSearch() returns (leaf page pinned, index at which to delete). */ if ((rc = dtSearch(ip, key, ino, &btstack, flag))) return rc; /* retrieve search result */ DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); /* * We need to find put the index of the next entry into the * directory index table in order to resume a readdir from this * entry. */ if (DO_INDEX(ip)) { stbl = DT_GETSTBL(p); ldtentry = (struct ldtentry *) & p->slot[stbl[index]]; table_index = le32_to_cpu(ldtentry->index); if (index == (p->header.nextindex - 1)) { /* * Last entry in this leaf page */ if ((p->header.flag & BT_ROOT) || (p->header.next == 0)) next_index = -1; else { /* Read next leaf page */ DT_GETPAGE(ip, le64_to_cpu(p->header.next), nmp, PSIZE, np, rc); if (rc) next_index = -1; else { stbl = DT_GETSTBL(np); ldtentry = (struct ldtentry *) & np-> slot[stbl[0]]; next_index = le32_to_cpu(ldtentry->index); DT_PUTPAGE(nmp); } } } else { ldtentry = (struct ldtentry *) & p->slot[stbl[index + 1]]; next_index = le32_to_cpu(ldtentry->index); } free_index(tid, ip, table_index, next_index); } /* * the leaf page becomes empty, delete the page */ if (p->header.nextindex == 1) { /* delete empty page */ rc = dtDeleteUp(tid, ip, mp, p, &btstack); } /* * the leaf page has other entries remaining: * * delete the entry from the leaf page. */ else { BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the leaf page */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; /* * Do not assume that dtlck->index will be zero. During a * rename within a directory, this transaction may have * modified this page already when adding the new entry. */ /* linelock header */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; /* linelock stbl of non-root leaf page */ if (!(p->header.flag & BT_ROOT)) { if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; i = index >> L2DTSLOTSIZE; lv->offset = p->header.stblindex + i; lv->length = ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - i + 1; dtlck->index++; } /* free the leaf entry */ dtDeleteEntry(p, index, &dtlck); /* * Update directory index table for entries moved in stbl */ if (DO_INDEX(ip) && index < p->header.nextindex) { s64 lblock; imp = NULL; stbl = DT_GETSTBL(p); for (i = index; i < p->header.nextindex; i++) { ldtentry = (struct ldtentry *) & p->slot[stbl[i]]; modify_index(tid, ip, le32_to_cpu(ldtentry->index), bn, i, &imp, &lblock); } if (imp) release_metapage(imp); } DT_PUTPAGE(mp); } return rc; } /* * dtDeleteUp() * * function: * free empty pages as propagating deletion up the tree * * parameter: * * return: */ static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, dtpage_t * fp, struct btstack * btstack) { int rc = 0; struct metapage *mp; dtpage_t *p; int index, nextindex; int xlen; struct btframe *parent; struct dt_lock *dtlck; struct tlock *tlck; struct lv *lv; struct pxd_lock *pxdlock; int i; /* * keep the root leaf page which has become empty */ if (BT_IS_ROOT(fmp)) { /* * reset the root * * dtInitRoot() acquires txlock on the root */ dtInitRoot(tid, ip, PARENT(ip)); DT_PUTPAGE(fmp); return 0; } /* * free the non-root leaf page */ /* * acquire a transaction lock on the page * * write FREEXTENT|NOREDOPAGE log record * N.B. linelock is overlaid as freed extent descriptor, and * the buffer page is freed; */ tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); pxdlock = (struct pxd_lock *) & tlck->lock; pxdlock->flag = mlckFREEPXD; pxdlock->pxd = fp->header.self; pxdlock->index = 1; /* update sibling pointers */ if ((rc = dtRelink(tid, ip, fp))) { BT_PUTPAGE(fmp); return rc; } xlen = lengthPXD(&fp->header.self); /* Free quota allocation. */ dquot_free_block(ip, xlen); /* free/invalidate its buffer page */ discard_metapage(fmp); /* * propagate page deletion up the directory tree * * If the delete from the parent page makes it empty, * continue all the way up the tree. * stop if the root page is reached (which is never deleted) or * if the entry deletion does not empty the page. */ while ((parent = BT_POP(btstack)) != NULL) { /* pin the parent page <sp> */ DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); if (rc) return rc; /* * free the extent of the child page deleted */ index = parent->index; /* * delete the entry for the child page from parent */ nextindex = p->header.nextindex; /* * the parent has the single entry being deleted: * * free the parent page which has become empty. */ if (nextindex == 1) { /* * keep the root internal page which has become empty */ if (p->header.flag & BT_ROOT) { /* * reset the root * * dtInitRoot() acquires txlock on the root */ dtInitRoot(tid, ip, PARENT(ip)); DT_PUTPAGE(mp); return 0; } /* * free the parent page */ else { /* * acquire a transaction lock on the page * * write FREEXTENT|NOREDOPAGE log record */ tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); pxdlock = (struct pxd_lock *) & tlck->lock; pxdlock->flag = mlckFREEPXD; pxdlock->pxd = p->header.self; pxdlock->index = 1; /* update sibling pointers */ if ((rc = dtRelink(tid, ip, p))) { DT_PUTPAGE(mp); return rc; } xlen = lengthPXD(&p->header.self); /* Free quota allocation */ dquot_free_block(ip, xlen); /* free/invalidate its buffer page */ discard_metapage(mp); /* propagate up */ continue; } } /* * the parent has other entries remaining: * * delete the router entry from the parent page. */ BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the page * * action: router entry deletion */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; /* linelock header */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; /* linelock stbl of non-root leaf page */ if (!(p->header.flag & BT_ROOT)) { if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } i = index >> L2DTSLOTSIZE; lv->offset = p->header.stblindex + i; lv->length = ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - i + 1; dtlck->index++; } /* free the router entry */ dtDeleteEntry(p, index, &dtlck); /* reset key of new leftmost entry of level (for consistency) */ if (index == 0 && ((p->header.flag & BT_ROOT) || p->header.prev == 0)) dtTruncateEntry(p, 0, &dtlck); /* unpin the parent page */ DT_PUTPAGE(mp); /* exit propagation up */ break; } if (!DO_INDEX(ip)) ip->i_size -= PSIZE; return 0; } /* * dtRelink() * * function: * link around a freed page. * * parameter: * fp: page to be freed * * return: */ static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p) { int rc; struct metapage *mp; s64 nextbn, prevbn; struct tlock *tlck; struct dt_lock *dtlck; struct lv *lv; nextbn = le64_to_cpu(p->header.next); prevbn = le64_to_cpu(p->header.prev); /* update prev pointer of the next page */ if (nextbn != 0) { DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); if (rc) return rc; BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the next page * * action: update prev pointer; */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); jfs_info("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p", tlck, ip, mp); dtlck = (struct dt_lock *) & tlck->lock; /* linelock header */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; p->header.prev = cpu_to_le64(prevbn); DT_PUTPAGE(mp); } /* update next pointer of the previous page */ if (prevbn != 0) { DT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); if (rc) return rc; BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the prev page * * action: update next pointer; */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); jfs_info("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p", tlck, ip, mp); dtlck = (struct dt_lock *) & tlck->lock; /* linelock header */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = 0; lv->length = 1; dtlck->index++; p->header.next = cpu_to_le64(nextbn); DT_PUTPAGE(mp); } return 0; } /* * dtInitRoot() * * initialize directory root (inline in inode) */ void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); dtroot_t *p; int fsi; struct dtslot *f; struct tlock *tlck; struct dt_lock *dtlck; struct lv *lv; u16 xflag_save; /* * If this was previously an non-empty directory, we need to remove * the old directory table. */ if (DO_INDEX(ip)) { if (!jfs_dirtable_inline(ip)) { struct tblock *tblk = tid_to_tblock(tid); /* * We're playing games with the tid's xflag. If * we're removing a regular file, the file's xtree * is committed with COMMIT_PMAP, but we always * commit the directories xtree with COMMIT_PWMAP. */ xflag_save = tblk->xflag; tblk->xflag = 0; /* * xtTruncate isn't guaranteed to fully truncate * the xtree. The caller needs to check i_size * after committing the transaction to see if * additional truncation is needed. The * COMMIT_Stale flag tells caller that we * initiated the truncation. */ xtTruncate(tid, ip, 0, COMMIT_PWMAP); set_cflag(COMMIT_Stale, ip); tblk->xflag = xflag_save; } else ip->i_size = 1; jfs_ip->next_index = 2; } else ip->i_size = IDATASIZE; /* * acquire a transaction lock on the root * * action: directory initialization; */ tlck = txLock(tid, ip, (struct metapage *) & jfs_ip->bxflag, tlckDTREE | tlckENTRY | tlckBTROOT); dtlck = (struct dt_lock *) & tlck->lock; /* linelock root */ ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = 0; lv->length = DTROOTMAXSLOT; dtlck->index++; p = &jfs_ip->i_dtroot; p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF; p->header.nextindex = 0; /* init freelist */ fsi = 1; f = &p->slot[fsi]; /* init data area of root */ for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) f->next = fsi; f->next = -1; p->header.freelist = 1; p->header.freecnt = 8; /* init '..' entry */ p->header.idotdot = cpu_to_le32(idotdot); return; } /* * add_missing_indices() * * function: Fix dtree page in which one or more entries has an invalid index. * fsck.jfs should really fix this, but it currently does not. * Called from jfs_readdir when bad index is detected. */ static void add_missing_indices(struct inode *inode, s64 bn) { struct ldtentry *d; struct dt_lock *dtlck; int i; uint index; struct lv *lv; struct metapage *mp; dtpage_t *p; int rc; s8 *stbl; tid_t tid; struct tlock *tlck; tid = txBegin(inode->i_sb, 0); DT_GETPAGE(inode, bn, mp, PSIZE, p, rc); if (rc) { printk(KERN_ERR "DT_GETPAGE failed!\n"); goto end; } BT_MARK_DIRTY(mp, inode); ASSERT(p->header.flag & BT_LEAF); tlck = txLock(tid, inode, mp, tlckDTREE | tlckENTRY); if (BT_IS_ROOT(mp)) tlck->type |= tlckBTROOT; dtlck = (struct dt_lock *) &tlck->lock; stbl = DT_GETSTBL(p); for (i = 0; i < p->header.nextindex; i++) { d = (struct ldtentry *) &p->slot[stbl[i]]; index = le32_to_cpu(d->index); if ((index < 2) || (index >= JFS_IP(inode)->next_index)) { d->index = cpu_to_le32(add_index(tid, inode, bn, i)); if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = &dtlck->lv[dtlck->index]; lv->offset = stbl[i]; lv->length = 1; dtlck->index++; } } DT_PUTPAGE(mp); (void) txCommit(tid, 1, &inode, 0); end: txEnd(tid); } /* * Buffer to hold directory entry info while traversing a dtree page * before being fed to the filldir function */ struct jfs_dirent { loff_t position; int ino; u16 name_len; char name[]; }; /* * function to determine next variable-sized jfs_dirent in buffer */ static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent) { return (struct jfs_dirent *) ((char *)dirent + ((sizeof (struct jfs_dirent) + dirent->name_len + 1 + sizeof (loff_t) - 1) & ~(sizeof (loff_t) - 1))); } /* * jfs_readdir() * * function: read directory entries sequentially * from the specified entry offset * * parameter: * * return: offset = (pn, index) of start entry * of next jfs_readdir()/dtRead() */ int jfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *ip = file_inode(file); struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab; int rc = 0; loff_t dtpos; /* legacy OS/2 style position */ struct dtoffset { s16 pn; s16 index; s32 unused; } *dtoffset = (struct dtoffset *) &dtpos; s64 bn; struct metapage *mp; dtpage_t *p; int index; s8 *stbl; struct btstack btstack; int i, next; struct ldtentry *d; struct dtslot *t; int d_namleft, len, outlen; unsigned long dirent_buf; char *name_ptr; u32 dir_index; int do_index = 0; uint loop_count = 0; struct jfs_dirent *jfs_dirent; int jfs_dirents; int overflow, fix_page, page_fixed = 0; static int unique_pos = 2; /* If we can't fix broken index */ if (ctx->pos == DIREND) return 0; if (DO_INDEX(ip)) { /* * persistent index is stored in directory entries. * Special cases: 0 = . * 1 = .. * -1 = End of directory */ do_index = 1; dir_index = (u32) ctx->pos; /* * NFSv4 reserves cookies 1 and 2 for . and .. so the value * we return to the vfs is one greater than the one we use * internally. */ if (dir_index) dir_index--; if (dir_index > 1) { struct dir_table_slot dirtab_slot; if (dtEmpty(ip) || (dir_index >= JFS_IP(ip)->next_index)) { /* Stale position. Directory has shrunk */ ctx->pos = DIREND; return 0; } repeat: rc = read_index(ip, dir_index, &dirtab_slot); if (rc) { ctx->pos = DIREND; return rc; } if (dirtab_slot.flag == DIR_INDEX_FREE) { if (loop_count++ > JFS_IP(ip)->next_index) { jfs_err("jfs_readdir detected infinite loop!"); ctx->pos = DIREND; return 0; } dir_index = le32_to_cpu(dirtab_slot.addr2); if (dir_index == -1) { ctx->pos = DIREND; return 0; } goto repeat; } bn = addressDTS(&dirtab_slot); index = dirtab_slot.slot; DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) { ctx->pos = DIREND; return 0; } if (p->header.flag & BT_INTERNAL) { jfs_err("jfs_readdir: bad index table"); DT_PUTPAGE(mp); ctx->pos = DIREND; return 0; } } else { if (dir_index == 0) { /* * self "." */ ctx->pos = 1; if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) return 0; } /* * parent ".." */ ctx->pos = 2; if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; /* * Find first entry of left-most leaf */ if (dtEmpty(ip)) { ctx->pos = DIREND; return 0; } if ((rc = dtReadFirst(ip, &btstack))) return rc; DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); } } else { /* * Legacy filesystem - OS/2 & Linux JFS < 0.3.6 * * pn = 0; index = 1: First entry "." * pn = 0; index = 2: Second entry ".." * pn > 0: Real entries, pn=1 -> leftmost page * pn = index = -1: No more entries */ dtpos = ctx->pos; if (dtpos < 2) { /* build "." entry */ ctx->pos = 1; if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) return 0; dtoffset->index = 2; ctx->pos = dtpos; } if (dtoffset->pn == 0) { if (dtoffset->index == 2) { /* build ".." entry */ if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) return 0; } else { jfs_err("jfs_readdir called with invalid offset!"); } dtoffset->pn = 1; dtoffset->index = 0; ctx->pos = dtpos; } if (dtEmpty(ip)) { ctx->pos = DIREND; return 0; } if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) { jfs_err("jfs_readdir: unexpected rc = %d from dtReadNext", rc); ctx->pos = DIREND; return 0; } /* get start leaf page and index */ DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); /* offset beyond directory eof ? */ if (bn < 0) { ctx->pos = DIREND; return 0; } } dirent_buf = __get_free_page(GFP_KERNEL); if (dirent_buf == 0) { DT_PUTPAGE(mp); jfs_warn("jfs_readdir: __get_free_page failed!"); ctx->pos = DIREND; return -ENOMEM; } while (1) { jfs_dirent = (struct jfs_dirent *) dirent_buf; jfs_dirents = 0; overflow = fix_page = 0; stbl = DT_GETSTBL(p); for (i = index; i < p->header.nextindex; i++) { d = (struct ldtentry *) & p->slot[stbl[i]]; if (((long) jfs_dirent + d->namlen + 1) > (dirent_buf + PAGE_SIZE)) { /* DBCS codepages could overrun dirent_buf */ index = i; overflow = 1; break; } d_namleft = d->namlen; name_ptr = jfs_dirent->name; jfs_dirent->ino = le32_to_cpu(d->inumber); if (do_index) { len = min(d_namleft, DTLHDRDATALEN); jfs_dirent->position = le32_to_cpu(d->index); /* * d->index should always be valid, but it * isn't. fsck.jfs doesn't create the * directory index for the lost+found * directory. Rather than let it go, * we can try to fix it. */ if ((jfs_dirent->position < 2) || (jfs_dirent->position >= JFS_IP(ip)->next_index)) { if (!page_fixed && !isReadOnly(ip)) { fix_page = 1; /* * setting overflow and setting * index to i will cause the * same page to be processed * again starting here */ overflow = 1; index = i; break; } jfs_dirent->position = unique_pos++; } /* * We add 1 to the index because we may * use a value of 2 internally, and NFSv4 * doesn't like that. */ jfs_dirent->position++; } else { jfs_dirent->position = dtpos; len = min(d_namleft, DTLHDRDATALEN_LEGACY); } /* copy the name of head/only segment */ outlen = jfs_strfromUCS_le(name_ptr, d->name, len, codepage); jfs_dirent->name_len = outlen; /* copy name in the additional segment(s) */ next = d->next; while (next >= 0) { t = (struct dtslot *) & p->slot[next]; name_ptr += outlen; d_namleft -= len; /* Sanity Check */ if (d_namleft == 0) { jfs_error(ip->i_sb, "JFS:Dtree error: ino = %ld, bn=%lld, index = %d\n", (long)ip->i_ino, (long long)bn, i); goto skip_one; } len = min(d_namleft, DTSLOTDATALEN); outlen = jfs_strfromUCS_le(name_ptr, t->name, len, codepage); jfs_dirent->name_len += outlen; next = t->next; } jfs_dirents++; jfs_dirent = next_jfs_dirent(jfs_dirent); skip_one: if (!do_index) dtoffset->index++; } if (!overflow) { /* Point to next leaf page */ if (p->header.flag & BT_ROOT) bn = 0; else { bn = le64_to_cpu(p->header.next); index = 0; /* update offset (pn:index) for new page */ if (!do_index) { dtoffset->pn++; dtoffset->index = 0; } } page_fixed = 0; } /* unpin previous leaf page */ DT_PUTPAGE(mp); jfs_dirent = (struct jfs_dirent *) dirent_buf; while (jfs_dirents--) { ctx->pos = jfs_dirent->position; if (!dir_emit(ctx, jfs_dirent->name, jfs_dirent->name_len, jfs_dirent->ino, DT_UNKNOWN)) goto out; jfs_dirent = next_jfs_dirent(jfs_dirent); } if (fix_page) { add_missing_indices(ip, bn); page_fixed = 1; } if (!overflow && (bn == 0)) { ctx->pos = DIREND; break; } DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) { free_page(dirent_buf); return rc; } } out: free_page(dirent_buf); return rc; } /* * dtReadFirst() * * function: get the leftmost page of the directory */ static int dtReadFirst(struct inode *ip, struct btstack * btstack) { int rc = 0; s64 bn; int psize = 288; /* initial in-line directory */ struct metapage *mp; dtpage_t *p; s8 *stbl; struct btframe *btsp; pxd_t *xd; BT_CLR(btstack); /* reset stack */ /* * descend leftmost path of the tree * * by convention, root bn = 0. */ for (bn = 0;;) { DT_GETPAGE(ip, bn, mp, psize, p, rc); if (rc) return rc; /* * leftmost leaf page */ if (p->header.flag & BT_LEAF) { /* return leftmost entry */ btsp = btstack->top; btsp->bn = bn; btsp->index = 0; btsp->mp = mp; return 0; } /* * descend down to leftmost child page */ if (BT_STACK_FULL(btstack)) { DT_PUTPAGE(mp); jfs_error(ip->i_sb, "btstack overrun\n"); BT_STACK_DUMP(btstack); return -EIO; } /* push (bn, index) of the parent page/entry */ BT_PUSH(btstack, bn, 0); /* get the leftmost entry */ stbl = DT_GETSTBL(p); xd = (pxd_t *) & p->slot[stbl[0]]; /* get the child page block address */ bn = addressPXD(xd); psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize; /* unpin the parent page */ DT_PUTPAGE(mp); } } /* * dtReadNext() * * function: get the page of the specified offset (pn:index) * * return: if (offset > eof), bn = -1; * * note: if index > nextindex of the target leaf page, * start with 1st entry of next leaf page; */ static int dtReadNext(struct inode *ip, loff_t * offset, struct btstack * btstack) { int rc = 0; struct dtoffset { s16 pn; s16 index; s32 unused; } *dtoffset = (struct dtoffset *) offset; s64 bn; struct metapage *mp; dtpage_t *p; int index; int pn; s8 *stbl; struct btframe *btsp, *parent; pxd_t *xd; /* * get leftmost leaf page pinned */ if ((rc = dtReadFirst(ip, btstack))) return rc; /* get leaf page */ DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); /* get the start offset (pn:index) */ pn = dtoffset->pn - 1; /* Now pn = 0 represents leftmost leaf */ index = dtoffset->index; /* start at leftmost page ? */ if (pn == 0) { /* offset beyond eof ? */ if (index < p->header.nextindex) goto out; if (p->header.flag & BT_ROOT) { bn = -1; goto out; } /* start with 1st entry of next leaf page */ dtoffset->pn++; dtoffset->index = index = 0; goto a; } /* start at non-leftmost page: scan parent pages for large pn */ if (p->header.flag & BT_ROOT) { bn = -1; goto out; } /* start after next leaf page ? */ if (pn > 1) goto b; /* get leaf page pn = 1 */ a: bn = le64_to_cpu(p->header.next); /* unpin leaf page */ DT_PUTPAGE(mp); /* offset beyond eof ? */ if (bn == 0) { bn = -1; goto out; } goto c; /* * scan last internal page level to get target leaf page */ b: /* unpin leftmost leaf page */ DT_PUTPAGE(mp); /* get left most parent page */ btsp = btstack->top; parent = btsp - 1; bn = parent->bn; DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* scan parent pages at last internal page level */ while (pn >= p->header.nextindex) { pn -= p->header.nextindex; /* get next parent page address */ bn = le64_to_cpu(p->header.next); /* unpin current parent page */ DT_PUTPAGE(mp); /* offset beyond eof ? */ if (bn == 0) { bn = -1; goto out; } /* get next parent page */ DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* update parent page stack frame */ parent->bn = bn; } /* get leaf page address */ stbl = DT_GETSTBL(p); xd = (pxd_t *) & p->slot[stbl[pn]]; bn = addressPXD(xd); /* unpin parent page */ DT_PUTPAGE(mp); /* * get target leaf page */ c: DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* * leaf page has been completed: * start with 1st entry of next leaf page */ if (index >= p->header.nextindex) { bn = le64_to_cpu(p->header.next); /* unpin leaf page */ DT_PUTPAGE(mp); /* offset beyond eof ? */ if (bn == 0) { bn = -1; goto out; } /* get next leaf page */ DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* start with 1st entry of next leaf page */ dtoffset->pn++; dtoffset->index = 0; } out: /* return target leaf page pinned */ btsp = btstack->top; btsp->bn = bn; btsp->index = dtoffset->index; btsp->mp = mp; return 0; } /* * dtCompare() * * function: compare search key with an internal entry * * return: * < 0 if k is < record * = 0 if k is = record * > 0 if k is > record */ static int dtCompare(struct component_name * key, /* search key */ dtpage_t * p, /* directory page */ int si) { /* entry slot index */ wchar_t *kname; __le16 *name; int klen, namlen, len, rc; struct idtentry *ih; struct dtslot *t; /* * force the left-most key on internal pages, at any level of * the tree, to be less than any search key. * this obviates having to update the leftmost key on an internal * page when the user inserts a new key in the tree smaller than * anything that has been stored. * * (? if/when dtSearch() narrows down to 1st entry (index = 0), * at any internal page at any level of the tree, * it descends to child of the entry anyway - * ? make the entry as min size dummy entry) * * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) * return (1); */ kname = key->name; klen = key->namlen; ih = (struct idtentry *) & p->slot[si]; si = ih->next; name = ih->name; namlen = ih->namlen; len = min(namlen, DTIHDRDATALEN); /* compare with head/only segment */ len = min(klen, len); if ((rc = UniStrncmp_le(kname, name, len))) return rc; klen -= len; namlen -= len; /* compare with additional segment(s) */ kname += len; while (klen > 0 && namlen > 0) { /* compare with next name segment */ t = (struct dtslot *) & p->slot[si]; len = min(namlen, DTSLOTDATALEN); len = min(klen, len); name = t->name; if ((rc = UniStrncmp_le(kname, name, len))) return rc; klen -= len; namlen -= len; kname += len; si = t->next; } return (klen - namlen); } /* * ciCompare() * * function: compare search key with an (leaf/internal) entry * * return: * < 0 if k is < record * = 0 if k is = record * > 0 if k is > record */ static int ciCompare(struct component_name * key, /* search key */ dtpage_t * p, /* directory page */ int si, /* entry slot index */ int flag) { wchar_t *kname, x; __le16 *name; int klen, namlen, len, rc; struct ldtentry *lh; struct idtentry *ih; struct dtslot *t; int i; /* * force the left-most key on internal pages, at any level of * the tree, to be less than any search key. * this obviates having to update the leftmost key on an internal * page when the user inserts a new key in the tree smaller than * anything that has been stored. * * (? if/when dtSearch() narrows down to 1st entry (index = 0), * at any internal page at any level of the tree, * it descends to child of the entry anyway - * ? make the entry as min size dummy entry) * * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) * return (1); */ kname = key->name; klen = key->namlen; /* * leaf page entry */ if (p->header.flag & BT_LEAF) { lh = (struct ldtentry *) & p->slot[si]; si = lh->next; name = lh->name; namlen = lh->namlen; if (flag & JFS_DIR_INDEX) len = min(namlen, DTLHDRDATALEN); else len = min(namlen, DTLHDRDATALEN_LEGACY); } /* * internal page entry */ else { ih = (struct idtentry *) & p->slot[si]; si = ih->next; name = ih->name; namlen = ih->namlen; len = min(namlen, DTIHDRDATALEN); } /* compare with head/only segment */ len = min(klen, len); for (i = 0; i < len; i++, kname++, name++) { /* only uppercase if case-insensitive support is on */ if ((flag & JFS_OS2) == JFS_OS2) x = UniToupper(le16_to_cpu(*name)); else x = le16_to_cpu(*name); if ((rc = *kname - x)) return rc; } klen -= len; namlen -= len; /* compare with additional segment(s) */ while (klen > 0 && namlen > 0) { /* compare with next name segment */ t = (struct dtslot *) & p->slot[si]; len = min(namlen, DTSLOTDATALEN); len = min(klen, len); name = t->name; for (i = 0; i < len; i++, kname++, name++) { /* only uppercase if case-insensitive support is on */ if ((flag & JFS_OS2) == JFS_OS2) x = UniToupper(le16_to_cpu(*name)); else x = le16_to_cpu(*name); if ((rc = *kname - x)) return rc; } klen -= len; namlen -= len; si = t->next; } return (klen - namlen); } /* * ciGetLeafPrefixKey() * * function: compute prefix of suffix compression * from two adjacent leaf entries * across page boundary * * return: non-zero on error * */ static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, int ri, struct component_name * key, int flag) { int klen, namlen; wchar_t *pl, *pr, *kname; struct component_name lkey; struct component_name rkey; lkey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), GFP_KERNEL); if (lkey.name == NULL) return -ENOMEM; rkey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), GFP_KERNEL); if (rkey.name == NULL) { kfree(lkey.name); return -ENOMEM; } /* get left and right key */ dtGetKey(lp, li, &lkey, flag); lkey.name[lkey.namlen] = 0; if ((flag & JFS_OS2) == JFS_OS2) ciToUpper(&lkey); dtGetKey(rp, ri, &rkey, flag); rkey.name[rkey.namlen] = 0; if ((flag & JFS_OS2) == JFS_OS2) ciToUpper(&rkey); /* compute prefix */ klen = 0; kname = key->name; namlen = min(lkey.namlen, rkey.namlen); for (pl = lkey.name, pr = rkey.name; namlen; pl++, pr++, namlen--, klen++, kname++) { *kname = *pr; if (*pl != *pr) { key->namlen = klen + 1; goto free_names; } } /* l->namlen <= r->namlen since l <= r */ if (lkey.namlen < rkey.namlen) { *kname = *pr; key->namlen = klen + 1; } else /* l->namelen == r->namelen */ key->namlen = klen; free_names: kfree(lkey.name); kfree(rkey.name); return 0; } /* * dtGetKey() * * function: get key of the entry */ static void dtGetKey(dtpage_t * p, int i, /* entry index */ struct component_name * key, int flag) { int si; s8 *stbl; struct ldtentry *lh; struct idtentry *ih; struct dtslot *t; int namlen, len; wchar_t *kname; __le16 *name; /* get entry */ stbl = DT_GETSTBL(p); si = stbl[i]; if (p->header.flag & BT_LEAF) { lh = (struct ldtentry *) & p->slot[si]; si = lh->next; namlen = lh->namlen; name = lh->name; if (flag & JFS_DIR_INDEX) len = min(namlen, DTLHDRDATALEN); else len = min(namlen, DTLHDRDATALEN_LEGACY); } else { ih = (struct idtentry *) & p->slot[si]; si = ih->next; namlen = ih->namlen; name = ih->name; len = min(namlen, DTIHDRDATALEN); } key->namlen = namlen; kname = key->name; /* * move head/only segment */ UniStrncpy_from_le(kname, name, len); /* * move additional segment(s) */ while (si >= 0) { /* get next segment */ t = &p->slot[si]; kname += len; namlen -= len; len = min(namlen, DTSLOTDATALEN); UniStrncpy_from_le(kname, t->name, len); si = t->next; } } /* * dtInsertEntry() * * function: allocate free slot(s) and * write a leaf/internal entry * * return: entry slot index */ static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key, ddata_t * data, struct dt_lock ** dtlock) { struct dtslot *h, *t; struct ldtentry *lh = NULL; struct idtentry *ih = NULL; int hsi, fsi, klen, len, nextindex; wchar_t *kname; __le16 *name; s8 *stbl; pxd_t *xd; struct dt_lock *dtlck = *dtlock; struct lv *lv; int xsi, n; s64 bn = 0; struct metapage *mp = NULL; klen = key->namlen; kname = key->name; /* allocate a free slot */ hsi = fsi = p->header.freelist; h = &p->slot[fsi]; p->header.freelist = h->next; --p->header.freecnt; /* open new linelock */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = hsi; /* write head/only segment */ if (p->header.flag & BT_LEAF) { lh = (struct ldtentry *) h; lh->next = h->next; lh->inumber = cpu_to_le32(data->leaf.ino); lh->namlen = klen; name = lh->name; if (data->leaf.ip) { len = min(klen, DTLHDRDATALEN); if (!(p->header.flag & BT_ROOT)) bn = addressPXD(&p->header.self); lh->index = cpu_to_le32(add_index(data->leaf.tid, data->leaf.ip, bn, index)); } else len = min(klen, DTLHDRDATALEN_LEGACY); } else { ih = (struct idtentry *) h; ih->next = h->next; xd = (pxd_t *) ih; *xd = data->xd; ih->namlen = klen; name = ih->name; len = min(klen, DTIHDRDATALEN); } UniStrncpy_to_le(name, kname, len); n = 1; xsi = hsi; /* write additional segment(s) */ t = h; klen -= len; while (klen) { /* get free slot */ fsi = p->header.freelist; t = &p->slot[fsi]; p->header.freelist = t->next; --p->header.freecnt; /* is next slot contiguous ? */ if (fsi != xsi + 1) { /* close current linelock */ lv->length = n; dtlck->index++; /* open new linelock */ if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } lv->offset = fsi; n = 0; } kname += len; len = min(klen, DTSLOTDATALEN); UniStrncpy_to_le(t->name, kname, len); n++; xsi = fsi; klen -= len; } /* close current linelock */ lv->length = n; dtlck->index++; *dtlock = dtlck; /* terminate last/only segment */ if (h == t) { /* single segment entry */ if (p->header.flag & BT_LEAF) lh->next = -1; else ih->next = -1; } else /* multi-segment entry */ t->next = -1; /* if insert into middle, shift right succeeding entries in stbl */ stbl = DT_GETSTBL(p); nextindex = p->header.nextindex; if (index < nextindex) { memmove(stbl + index + 1, stbl + index, nextindex - index); if ((p->header.flag & BT_LEAF) && data->leaf.ip) { s64 lblock; /* * Need to update slot number for entries that moved * in the stbl */ mp = NULL; for (n = index + 1; n <= nextindex; n++) { lh = (struct ldtentry *) & (p->slot[stbl[n]]); modify_index(data->leaf.tid, data->leaf.ip, le32_to_cpu(lh->index), bn, n, &mp, &lblock); } if (mp) release_metapage(mp); } } stbl[index] = hsi; /* advance next available entry index of stbl */ ++p->header.nextindex; } /* * dtMoveEntry() * * function: move entries from split/left page to new/right page * * nextindex of dst page and freelist/freecnt of both pages * are updated. */ static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, struct dt_lock ** sdtlock, struct dt_lock ** ddtlock, int do_index) { int ssi, next; /* src slot index */ int di; /* dst entry index */ int dsi; /* dst slot index */ s8 *sstbl, *dstbl; /* sorted entry table */ int snamlen, len; struct ldtentry *slh, *dlh = NULL; struct idtentry *sih, *dih = NULL; struct dtslot *h, *s, *d; struct dt_lock *sdtlck = *sdtlock, *ddtlck = *ddtlock; struct lv *slv, *dlv; int xssi, ns, nd; int sfsi; sstbl = (s8 *) & sp->slot[sp->header.stblindex]; dstbl = (s8 *) & dp->slot[dp->header.stblindex]; dsi = dp->header.freelist; /* first (whole page) free slot */ sfsi = sp->header.freelist; /* linelock destination entry slot */ dlv = & ddtlck->lv[ddtlck->index]; dlv->offset = dsi; /* linelock source entry slot */ slv = & sdtlck->lv[sdtlck->index]; slv->offset = sstbl[si]; xssi = slv->offset - 1; /* * move entries */ ns = nd = 0; for (di = 0; si < sp->header.nextindex; si++, di++) { ssi = sstbl[si]; dstbl[di] = dsi; /* is next slot contiguous ? */ if (ssi != xssi + 1) { /* close current linelock */ slv->length = ns; sdtlck->index++; /* open new linelock */ if (sdtlck->index < sdtlck->maxcnt) slv++; else { sdtlck = (struct dt_lock *) txLinelock(sdtlck); slv = & sdtlck->lv[0]; } slv->offset = ssi; ns = 0; } /* * move head/only segment of an entry */ /* get dst slot */ h = d = &dp->slot[dsi]; /* get src slot and move */ s = &sp->slot[ssi]; if (sp->header.flag & BT_LEAF) { /* get source entry */ slh = (struct ldtentry *) s; dlh = (struct ldtentry *) h; snamlen = slh->namlen; if (do_index) { len = min(snamlen, DTLHDRDATALEN); dlh->index = slh->index; /* little-endian */ } else len = min(snamlen, DTLHDRDATALEN_LEGACY); memcpy(dlh, slh, 6 + len * 2); next = slh->next; /* update dst head/only segment next field */ dsi++; dlh->next = dsi; } else { sih = (struct idtentry *) s; snamlen = sih->namlen; len = min(snamlen, DTIHDRDATALEN); dih = (struct idtentry *) h; memcpy(dih, sih, 10 + len * 2); next = sih->next; dsi++; dih->next = dsi; } /* free src head/only segment */ s->next = sfsi; s->cnt = 1; sfsi = ssi; ns++; nd++; xssi = ssi; /* * move additional segment(s) of the entry */ snamlen -= len; while ((ssi = next) >= 0) { /* is next slot contiguous ? */ if (ssi != xssi + 1) { /* close current linelock */ slv->length = ns; sdtlck->index++; /* open new linelock */ if (sdtlck->index < sdtlck->maxcnt) slv++; else { sdtlck = (struct dt_lock *) txLinelock(sdtlck); slv = & sdtlck->lv[0]; } slv->offset = ssi; ns = 0; } /* get next source segment */ s = &sp->slot[ssi]; /* get next destination free slot */ d++; len = min(snamlen, DTSLOTDATALEN); UniStrncpy_le(d->name, s->name, len); ns++; nd++; xssi = ssi; dsi++; d->next = dsi; /* free source segment */ next = s->next; s->next = sfsi; s->cnt = 1; sfsi = ssi; snamlen -= len; } /* end while */ /* terminate dst last/only segment */ if (h == d) { /* single segment entry */ if (dp->header.flag & BT_LEAF) dlh->next = -1; else dih->next = -1; } else /* multi-segment entry */ d->next = -1; } /* end for */ /* close current linelock */ slv->length = ns; sdtlck->index++; *sdtlock = sdtlck; dlv->length = nd; ddtlck->index++; *ddtlock = ddtlck; /* update source header */ sp->header.freelist = sfsi; sp->header.freecnt += nd; /* update destination header */ dp->header.nextindex = di; dp->header.freelist = dsi; dp->header.freecnt -= nd; } /* * dtDeleteEntry() * * function: free a (leaf/internal) entry * * log freelist header, stbl, and each segment slot of entry * (even though last/only segment next field is modified, * physical image logging requires all segment slots of * the entry logged to avoid applying previous updates * to the same slots) */ static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock) { int fsi; /* free entry slot index */ s8 *stbl; struct dtslot *t; int si, freecnt; struct dt_lock *dtlck = *dtlock; struct lv *lv; int xsi, n; /* get free entry slot index */ stbl = DT_GETSTBL(p); fsi = stbl[fi]; /* open new linelock */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = fsi; /* get the head/only segment */ t = &p->slot[fsi]; if (p->header.flag & BT_LEAF) si = ((struct ldtentry *) t)->next; else si = ((struct idtentry *) t)->next; t->next = si; t->cnt = 1; n = freecnt = 1; xsi = fsi; /* find the last/only segment */ while (si >= 0) { /* is next slot contiguous ? */ if (si != xsi + 1) { /* close current linelock */ lv->length = n; dtlck->index++; /* open new linelock */ if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } lv->offset = si; n = 0; } n++; xsi = si; freecnt++; t = &p->slot[si]; t->cnt = 1; si = t->next; } /* close current linelock */ lv->length = n; dtlck->index++; *dtlock = dtlck; /* update freelist */ t->next = p->header.freelist; p->header.freelist = fsi; p->header.freecnt += freecnt; /* if delete from middle, * shift left the succedding entries in the stbl */ si = p->header.nextindex; if (fi < si - 1) memmove(&stbl[fi], &stbl[fi + 1], si - fi - 1); p->header.nextindex--; } /* * dtTruncateEntry() * * function: truncate a (leaf/internal) entry * * log freelist header, stbl, and each segment slot of entry * (even though last/only segment next field is modified, * physical image logging requires all segment slots of * the entry logged to avoid applying previous updates * to the same slots) */ static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock) { int tsi; /* truncate entry slot index */ s8 *stbl; struct dtslot *t; int si, freecnt; struct dt_lock *dtlck = *dtlock; struct lv *lv; int fsi, xsi, n; /* get free entry slot index */ stbl = DT_GETSTBL(p); tsi = stbl[ti]; /* open new linelock */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = tsi; /* get the head/only segment */ t = &p->slot[tsi]; ASSERT(p->header.flag & BT_INTERNAL); ((struct idtentry *) t)->namlen = 0; si = ((struct idtentry *) t)->next; ((struct idtentry *) t)->next = -1; n = 1; freecnt = 0; fsi = si; xsi = tsi; /* find the last/only segment */ while (si >= 0) { /* is next slot contiguous ? */ if (si != xsi + 1) { /* close current linelock */ lv->length = n; dtlck->index++; /* open new linelock */ if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } lv->offset = si; n = 0; } n++; xsi = si; freecnt++; t = &p->slot[si]; t->cnt = 1; si = t->next; } /* close current linelock */ lv->length = n; dtlck->index++; *dtlock = dtlck; /* update freelist */ if (freecnt == 0) return; t->next = p->header.freelist; p->header.freelist = fsi; p->header.freecnt += freecnt; } /* * dtLinelockFreelist() */ static void dtLinelockFreelist(dtpage_t * p, /* directory page */ int m, /* max slot index */ struct dt_lock ** dtlock) { int fsi; /* free entry slot index */ struct dtslot *t; int si; struct dt_lock *dtlck = *dtlock; struct lv *lv; int xsi, n; /* get free entry slot index */ fsi = p->header.freelist; /* open new linelock */ if (dtlck->index >= dtlck->maxcnt) dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[dtlck->index]; lv->offset = fsi; n = 1; xsi = fsi; t = &p->slot[fsi]; si = t->next; /* find the last/only segment */ while (si < m && si >= 0) { /* is next slot contiguous ? */ if (si != xsi + 1) { /* close current linelock */ lv->length = n; dtlck->index++; /* open new linelock */ if (dtlck->index < dtlck->maxcnt) lv++; else { dtlck = (struct dt_lock *) txLinelock(dtlck); lv = & dtlck->lv[0]; } lv->offset = si; n = 0; } n++; xsi = si; t = &p->slot[si]; si = t->next; } /* close current linelock */ lv->length = n; dtlck->index++; *dtlock = dtlck; } /* * NAME: dtModify * * FUNCTION: Modify the inode number part of a directory entry * * PARAMETERS: * tid - Transaction id * ip - Inode of parent directory * key - Name of entry to be modified * orig_ino - Original inode number expected in entry * new_ino - New inode number to put into entry * flag - JFS_RENAME * * RETURNS: * -ESTALE - If entry found does not match orig_ino passed in * -ENOENT - If no entry can be found to match key * 0 - If successfully modified entry */ int dtModify(tid_t tid, struct inode *ip, struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag) { int rc; s64 bn; struct metapage *mp; dtpage_t *p; int index; struct btstack btstack; struct tlock *tlck; struct dt_lock *dtlck; struct lv *lv; s8 *stbl; int entry_si; /* entry slot index */ struct ldtentry *entry; /* * search for the entry to modify: * * dtSearch() returns (leaf page pinned, index at which to modify). */ if ((rc = dtSearch(ip, key, orig_ino, &btstack, flag))) return rc; /* retrieve search result */ DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the leaf page of named entry */ tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); dtlck = (struct dt_lock *) & tlck->lock; /* get slot index of the entry */ stbl = DT_GETSTBL(p); entry_si = stbl[index]; /* linelock entry */ ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = entry_si; lv->length = 1; dtlck->index++; /* get the head/only segment */ entry = (struct ldtentry *) & p->slot[entry_si]; /* substitute the inode number of the entry */ entry->inumber = cpu_to_le32(new_ino); /* unpin the leaf page */ DT_PUTPAGE(mp); return 0; }
8 8 2 2 4 1 3 1 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Intel Corporation. All rights reserved. */ #define pr_fmt(fmt) "llcp: %s: " fmt, __func__ #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/nfc.h> #include <net/nfc/nfc.h> #include "nfc.h" #include "llcp.h" static const u8 llcp_tlv_length[LLCP_TLV_MAX] = { 0, 1, /* VERSION */ 2, /* MIUX */ 2, /* WKS */ 1, /* LTO */ 1, /* RW */ 0, /* SN */ 1, /* OPT */ 0, /* SDREQ */ 2, /* SDRES */ }; static u8 llcp_tlv8(const u8 *tlv, u8 type) { if (tlv[0] != type || tlv[1] != llcp_tlv_length[tlv[0]]) return 0; return tlv[2]; } static u16 llcp_tlv16(const u8 *tlv, u8 type) { if (tlv[0] != type || tlv[1] != llcp_tlv_length[tlv[0]]) return 0; return be16_to_cpu(*((__be16 *)(tlv + 2))); } static u8 llcp_tlv_version(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_VERSION); } static u16 llcp_tlv_miux(const u8 *tlv) { return llcp_tlv16(tlv, LLCP_TLV_MIUX) & 0x7ff; } static u16 llcp_tlv_wks(const u8 *tlv) { return llcp_tlv16(tlv, LLCP_TLV_WKS); } static u16 llcp_tlv_lto(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_LTO); } static u8 llcp_tlv_opt(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_OPT); } static u8 llcp_tlv_rw(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_RW) & 0xf; } u8 *nfc_llcp_build_tlv(u8 type, const u8 *value, u8 value_length, u8 *tlv_length) { u8 *tlv, length; pr_debug("type %d\n", type); if (type >= LLCP_TLV_MAX) return NULL; length = llcp_tlv_length[type]; if (length == 0 && value_length == 0) return NULL; else if (length == 0) length = value_length; *tlv_length = 2 + length; tlv = kzalloc(2 + length, GFP_KERNEL); if (tlv == NULL) return tlv; tlv[0] = type; tlv[1] = length; memcpy(tlv + 2, value, length); return tlv; } struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdres_tlv(u8 tid, u8 sap) { struct nfc_llcp_sdp_tlv *sdres; u8 value[2]; sdres = kzalloc(sizeof(struct nfc_llcp_sdp_tlv), GFP_KERNEL); if (sdres == NULL) return NULL; value[0] = tid; value[1] = sap; sdres->tlv = nfc_llcp_build_tlv(LLCP_TLV_SDRES, value, 2, &sdres->tlv_len); if (sdres->tlv == NULL) { kfree(sdres); return NULL; } sdres->tid = tid; sdres->sap = sap; INIT_HLIST_NODE(&sdres->node); return sdres; } struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, const char *uri, size_t uri_len) { struct nfc_llcp_sdp_tlv *sdreq; pr_debug("uri: %s, len: %zu\n", uri, uri_len); /* sdreq->tlv_len is u8, takes uri_len, + 3 for header, + 1 for NULL */ if (WARN_ON_ONCE(uri_len > U8_MAX - 4)) return NULL; sdreq = kzalloc(sizeof(struct nfc_llcp_sdp_tlv), GFP_KERNEL); if (sdreq == NULL) return NULL; sdreq->tlv_len = uri_len + 3; if (uri[uri_len - 1] == 0) sdreq->tlv_len--; sdreq->tlv = kzalloc(sdreq->tlv_len + 1, GFP_KERNEL); if (sdreq->tlv == NULL) { kfree(sdreq); return NULL; } sdreq->tlv[0] = LLCP_TLV_SDREQ; sdreq->tlv[1] = sdreq->tlv_len - 2; sdreq->tlv[2] = tid; sdreq->tid = tid; sdreq->uri = sdreq->tlv + 3; memcpy(sdreq->uri, uri, uri_len); sdreq->time = jiffies; INIT_HLIST_NODE(&sdreq->node); return sdreq; } void nfc_llcp_free_sdp_tlv(struct nfc_llcp_sdp_tlv *sdp) { kfree(sdp->tlv); kfree(sdp); } void nfc_llcp_free_sdp_tlv_list(struct hlist_head *head) { struct nfc_llcp_sdp_tlv *sdp; struct hlist_node *n; hlist_for_each_entry_safe(sdp, n, head, node) { hlist_del(&sdp->node); nfc_llcp_free_sdp_tlv(sdp); } } int nfc_llcp_parse_gb_tlv(struct nfc_llcp_local *local, const u8 *tlv_array, u16 tlv_array_len) { const u8 *tlv = tlv_array; u8 type, length, offset = 0; pr_debug("TLV array length %d\n", tlv_array_len); if (local == NULL) return -ENODEV; while (offset < tlv_array_len) { type = tlv[0]; length = tlv[1]; pr_debug("type 0x%x length %d\n", type, length); switch (type) { case LLCP_TLV_VERSION: local->remote_version = llcp_tlv_version(tlv); break; case LLCP_TLV_MIUX: local->remote_miu = llcp_tlv_miux(tlv) + 128; break; case LLCP_TLV_WKS: local->remote_wks = llcp_tlv_wks(tlv); break; case LLCP_TLV_LTO: local->remote_lto = llcp_tlv_lto(tlv) * 10; break; case LLCP_TLV_OPT: local->remote_opt = llcp_tlv_opt(tlv); break; default: pr_err("Invalid gt tlv value 0x%x\n", type); break; } offset += length + 2; tlv += length + 2; } pr_debug("version 0x%x miu %d lto %d opt 0x%x wks 0x%x\n", local->remote_version, local->remote_miu, local->remote_lto, local->remote_opt, local->remote_wks); return 0; } int nfc_llcp_parse_connection_tlv(struct nfc_llcp_sock *sock, const u8 *tlv_array, u16 tlv_array_len) { const u8 *tlv = tlv_array; u8 type, length, offset = 0; pr_debug("TLV array length %d\n", tlv_array_len); if (sock == NULL) return -ENOTCONN; while (offset < tlv_array_len) { type = tlv[0]; length = tlv[1]; pr_debug("type 0x%x length %d\n", type, length); switch (type) { case LLCP_TLV_MIUX: sock->remote_miu = llcp_tlv_miux(tlv) + 128; break; case LLCP_TLV_RW: sock->remote_rw = llcp_tlv_rw(tlv); break; case LLCP_TLV_SN: break; default: pr_err("Invalid gt tlv value 0x%x\n", type); break; } offset += length + 2; tlv += length + 2; } pr_debug("sock %p rw %d miu %d\n", sock, sock->remote_rw, sock->remote_miu); return 0; } static struct sk_buff *llcp_add_header(struct sk_buff *pdu, u8 dsap, u8 ssap, u8 ptype) { u8 header[2]; pr_debug("ptype 0x%x dsap 0x%x ssap 0x%x\n", ptype, dsap, ssap); header[0] = (u8)((dsap << 2) | (ptype >> 2)); header[1] = (u8)((ptype << 6) | ssap); pr_debug("header 0x%x 0x%x\n", header[0], header[1]); skb_put_data(pdu, header, LLCP_HEADER_SIZE); return pdu; } static struct sk_buff *llcp_add_tlv(struct sk_buff *pdu, const u8 *tlv, u8 tlv_length) { /* XXX Add an skb length check */ if (tlv == NULL) return NULL; skb_put_data(pdu, tlv, tlv_length); return pdu; } static struct sk_buff *llcp_allocate_pdu(struct nfc_llcp_sock *sock, u8 cmd, u16 size) { struct sk_buff *skb; int err; if (sock->ssap == 0) return NULL; skb = nfc_alloc_send_skb(sock->dev, &sock->sk, MSG_DONTWAIT, size + LLCP_HEADER_SIZE, &err); if (skb == NULL) { pr_err("Could not allocate PDU\n"); return NULL; } skb = llcp_add_header(skb, sock->dsap, sock->ssap, cmd); return skb; } int nfc_llcp_send_disconnect(struct nfc_llcp_sock *sock) { struct sk_buff *skb; struct nfc_dev *dev; struct nfc_llcp_local *local; local = sock->local; if (local == NULL) return -ENODEV; dev = sock->dev; if (dev == NULL) return -ENODEV; skb = llcp_allocate_pdu(sock, LLCP_PDU_DISC, 0); if (skb == NULL) return -ENOMEM; skb_queue_tail(&local->tx_queue, skb); return 0; } int nfc_llcp_send_symm(struct nfc_dev *dev) { struct sk_buff *skb; struct nfc_llcp_local *local; u16 size = 0; int err; local = nfc_llcp_find_local(dev); if (local == NULL) return -ENODEV; size += LLCP_HEADER_SIZE; size += dev->tx_headroom + dev->tx_tailroom + NFC_HEADER_SIZE; skb = alloc_skb(size, GFP_KERNEL); if (skb == NULL) { err = -ENOMEM; goto out; } skb_reserve(skb, dev->tx_headroom + NFC_HEADER_SIZE); skb = llcp_add_header(skb, 0, 0, LLCP_PDU_SYMM); __net_timestamp(skb); nfc_llcp_send_to_raw_sock(local, skb, NFC_DIRECTION_TX); err = nfc_data_exchange(dev, local->target_idx, skb, nfc_llcp_recv, local); out: nfc_llcp_local_put(local); return err; } int nfc_llcp_send_connect(struct nfc_llcp_sock *sock) { struct nfc_llcp_local *local; struct sk_buff *skb; const u8 *service_name_tlv = NULL; const u8 *miux_tlv = NULL; const u8 *rw_tlv = NULL; u8 service_name_tlv_length = 0; u8 miux_tlv_length, rw_tlv_length, rw; int err; u16 size = 0; __be16 miux; local = sock->local; if (local == NULL) return -ENODEV; if (sock->service_name != NULL) { service_name_tlv = nfc_llcp_build_tlv(LLCP_TLV_SN, sock->service_name, sock->service_name_len, &service_name_tlv_length); if (!service_name_tlv) { err = -ENOMEM; goto error_tlv; } size += service_name_tlv_length; } /* If the socket parameters are not set, use the local ones */ miux = be16_to_cpu(sock->miux) > LLCP_MAX_MIUX ? local->miux : sock->miux; rw = sock->rw > LLCP_MAX_RW ? local->rw : sock->rw; miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&miux, 0, &miux_tlv_length); if (!miux_tlv) { err = -ENOMEM; goto error_tlv; } size += miux_tlv_length; rw_tlv = nfc_llcp_build_tlv(LLCP_TLV_RW, &rw, 0, &rw_tlv_length); if (!rw_tlv) { err = -ENOMEM; goto error_tlv; } size += rw_tlv_length; pr_debug("SKB size %d SN length %zu\n", size, sock->service_name_len); skb = llcp_allocate_pdu(sock, LLCP_PDU_CONNECT, size); if (skb == NULL) { err = -ENOMEM; goto error_tlv; } llcp_add_tlv(skb, service_name_tlv, service_name_tlv_length); llcp_add_tlv(skb, miux_tlv, miux_tlv_length); llcp_add_tlv(skb, rw_tlv, rw_tlv_length); skb_queue_tail(&local->tx_queue, skb); err = 0; error_tlv: if (err) pr_err("error %d\n", err); kfree(service_name_tlv); kfree(miux_tlv); kfree(rw_tlv); return err; } int nfc_llcp_send_cc(struct nfc_llcp_sock *sock) { struct nfc_llcp_local *local; struct sk_buff *skb; const u8 *miux_tlv = NULL; const u8 *rw_tlv = NULL; u8 miux_tlv_length, rw_tlv_length, rw; int err; u16 size = 0; __be16 miux; local = sock->local; if (local == NULL) return -ENODEV; /* If the socket parameters are not set, use the local ones */ miux = be16_to_cpu(sock->miux) > LLCP_MAX_MIUX ? local->miux : sock->miux; rw = sock->rw > LLCP_MAX_RW ? local->rw : sock->rw; miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&miux, 0, &miux_tlv_length); if (!miux_tlv) { err = -ENOMEM; goto error_tlv; } size += miux_tlv_length; rw_tlv = nfc_llcp_build_tlv(LLCP_TLV_RW, &rw, 0, &rw_tlv_length); if (!rw_tlv) { err = -ENOMEM; goto error_tlv; } size += rw_tlv_length; skb = llcp_allocate_pdu(sock, LLCP_PDU_CC, size); if (skb == NULL) { err = -ENOMEM; goto error_tlv; } llcp_add_tlv(skb, miux_tlv, miux_tlv_length); llcp_add_tlv(skb, rw_tlv, rw_tlv_length); skb_queue_tail(&local->tx_queue, skb); err = 0; error_tlv: if (err) pr_err("error %d\n", err); kfree(miux_tlv); kfree(rw_tlv); return err; } static struct sk_buff *nfc_llcp_allocate_snl(struct nfc_llcp_local *local, size_t tlv_length) { struct sk_buff *skb; struct nfc_dev *dev; u16 size = 0; if (local == NULL) return ERR_PTR(-ENODEV); dev = local->dev; if (dev == NULL) return ERR_PTR(-ENODEV); size += LLCP_HEADER_SIZE; size += dev->tx_headroom + dev->tx_tailroom + NFC_HEADER_SIZE; size += tlv_length; skb = alloc_skb(size, GFP_KERNEL); if (skb == NULL) return ERR_PTR(-ENOMEM); skb_reserve(skb, dev->tx_headroom + NFC_HEADER_SIZE); skb = llcp_add_header(skb, LLCP_SAP_SDP, LLCP_SAP_SDP, LLCP_PDU_SNL); return skb; } int nfc_llcp_send_snl_sdres(struct nfc_llcp_local *local, struct hlist_head *tlv_list, size_t tlvs_len) { struct nfc_llcp_sdp_tlv *sdp; struct hlist_node *n; struct sk_buff *skb; skb = nfc_llcp_allocate_snl(local, tlvs_len); if (IS_ERR(skb)) return PTR_ERR(skb); hlist_for_each_entry_safe(sdp, n, tlv_list, node) { skb_put_data(skb, sdp->tlv, sdp->tlv_len); hlist_del(&sdp->node); nfc_llcp_free_sdp_tlv(sdp); } skb_queue_tail(&local->tx_queue, skb); return 0; } int nfc_llcp_send_snl_sdreq(struct nfc_llcp_local *local, struct hlist_head *tlv_list, size_t tlvs_len) { struct nfc_llcp_sdp_tlv *sdreq; struct hlist_node *n; struct sk_buff *skb; skb = nfc_llcp_allocate_snl(local, tlvs_len); if (IS_ERR(skb)) return PTR_ERR(skb); mutex_lock(&local->sdreq_lock); if (hlist_empty(&local->pending_sdreqs)) mod_timer(&local->sdreq_timer, jiffies + msecs_to_jiffies(3 * local->remote_lto)); hlist_for_each_entry_safe(sdreq, n, tlv_list, node) { pr_debug("tid %d for %s\n", sdreq->tid, sdreq->uri); skb_put_data(skb, sdreq->tlv, sdreq->tlv_len); hlist_del(&sdreq->node); hlist_add_head(&sdreq->node, &local->pending_sdreqs); } mutex_unlock(&local->sdreq_lock); skb_queue_tail(&local->tx_queue, skb); return 0; } int nfc_llcp_send_dm(struct nfc_llcp_local *local, u8 ssap, u8 dsap, u8 reason) { struct sk_buff *skb; struct nfc_dev *dev; u16 size = 1; /* Reason code */ pr_debug("Sending DM reason 0x%x\n", reason); if (local == NULL) return -ENODEV; dev = local->dev; if (dev == NULL) return -ENODEV; size += LLCP_HEADER_SIZE; size += dev->tx_headroom + dev->tx_tailroom + NFC_HEADER_SIZE; skb = alloc_skb(size, GFP_KERNEL); if (skb == NULL) return -ENOMEM; skb_reserve(skb, dev->tx_headroom + NFC_HEADER_SIZE); skb = llcp_add_header(skb, dsap, ssap, LLCP_PDU_DM); skb_put_data(skb, &reason, 1); skb_queue_head(&local->tx_queue, skb); return 0; } int nfc_llcp_send_i_frame(struct nfc_llcp_sock *sock, struct msghdr *msg, size_t len) { struct sk_buff *pdu; struct sock *sk = &sock->sk; struct nfc_llcp_local *local; size_t frag_len = 0, remaining_len; u8 *msg_data, *msg_ptr; u16 remote_miu; pr_debug("Send I frame len %zd\n", len); local = sock->local; if (local == NULL) return -ENODEV; /* Remote is ready but has not acknowledged our frames */ if((sock->remote_ready && skb_queue_len(&sock->tx_pending_queue) >= sock->remote_rw && skb_queue_len(&sock->tx_queue) >= 2 * sock->remote_rw)) { pr_err("Pending queue is full %d frames\n", skb_queue_len(&sock->tx_pending_queue)); return -ENOBUFS; } /* Remote is not ready and we've been queueing enough frames */ if ((!sock->remote_ready && skb_queue_len(&sock->tx_queue) >= 2 * sock->remote_rw)) { pr_err("Tx queue is full %d frames\n", skb_queue_len(&sock->tx_queue)); return -ENOBUFS; } msg_data = kmalloc(len, GFP_USER | __GFP_NOWARN); if (msg_data == NULL) return -ENOMEM; if (memcpy_from_msg(msg_data, msg, len)) { kfree(msg_data); return -EFAULT; } remaining_len = len; msg_ptr = msg_data; do { remote_miu = sock->remote_miu > LLCP_MAX_MIU ? LLCP_DEFAULT_MIU : sock->remote_miu; frag_len = min_t(size_t, remote_miu, remaining_len); pr_debug("Fragment %zd bytes remaining %zd", frag_len, remaining_len); pdu = llcp_allocate_pdu(sock, LLCP_PDU_I, frag_len + LLCP_SEQUENCE_SIZE); if (pdu == NULL) { kfree(msg_data); return -ENOMEM; } skb_put(pdu, LLCP_SEQUENCE_SIZE); if (likely(frag_len > 0)) skb_put_data(pdu, msg_ptr, frag_len); skb_queue_tail(&sock->tx_queue, pdu); lock_sock(sk); nfc_llcp_queue_i_frames(sock); release_sock(sk); remaining_len -= frag_len; msg_ptr += frag_len; } while (remaining_len > 0); kfree(msg_data); return len; } int nfc_llcp_send_ui_frame(struct nfc_llcp_sock *sock, u8 ssap, u8 dsap, struct msghdr *msg, size_t len) { struct sk_buff *pdu; struct nfc_llcp_local *local; size_t frag_len = 0, remaining_len; u8 *msg_ptr, *msg_data; u16 remote_miu; int err; pr_debug("Send UI frame len %zd\n", len); local = sock->local; if (local == NULL) return -ENODEV; msg_data = kmalloc(len, GFP_USER | __GFP_NOWARN); if (msg_data == NULL) return -ENOMEM; if (memcpy_from_msg(msg_data, msg, len)) { kfree(msg_data); return -EFAULT; } remaining_len = len; msg_ptr = msg_data; do { remote_miu = sock->remote_miu > LLCP_MAX_MIU ? local->remote_miu : sock->remote_miu; frag_len = min_t(size_t, remote_miu, remaining_len); pr_debug("Fragment %zd bytes remaining %zd", frag_len, remaining_len); pdu = nfc_alloc_send_skb(sock->dev, &sock->sk, 0, frag_len + LLCP_HEADER_SIZE, &err); if (pdu == NULL) { pr_err("Could not allocate PDU (error=%d)\n", err); len -= remaining_len; if (len == 0) len = err; break; } pdu = llcp_add_header(pdu, dsap, ssap, LLCP_PDU_UI); if (likely(frag_len > 0)) skb_put_data(pdu, msg_ptr, frag_len); /* No need to check for the peer RW for UI frames */ skb_queue_tail(&local->tx_queue, pdu); remaining_len -= frag_len; msg_ptr += frag_len; } while (remaining_len > 0); kfree(msg_data); return len; } int nfc_llcp_send_rr(struct nfc_llcp_sock *sock) { struct sk_buff *skb; struct nfc_llcp_local *local; pr_debug("Send rr nr %d\n", sock->recv_n); local = sock->local; if (local == NULL) return -ENODEV; skb = llcp_allocate_pdu(sock, LLCP_PDU_RR, LLCP_SEQUENCE_SIZE); if (skb == NULL) return -ENOMEM; skb_put(skb, LLCP_SEQUENCE_SIZE); skb->data[2] = sock->recv_n; skb_queue_head(&local->tx_queue, skb); return 0; }
106 670 325 328 1 11 2 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 // SPDX-License-Identifier: GPL-2.0 /* * linux/ipc/namespace.c * Copyright (C) 2006 Pavel Emelyanov <xemul@openvz.org> OpenVZ, SWsoft Inc. */ #include <linux/ipc.h> #include <linux/msg.h> #include <linux/ipc_namespace.h> #include <linux/rcupdate.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/sched/task.h> #include "util.h" /* * The work queue is used to avoid the cost of synchronize_rcu in kern_unmount. */ static void free_ipc(struct work_struct *unused); static DECLARE_WORK(free_ipc_work, free_ipc); static struct ucounts *inc_ipc_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_IPC_NAMESPACES); } static void dec_ipc_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_IPC_NAMESPACES); } static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, struct ipc_namespace *old_ns) { struct ipc_namespace *ns; struct ucounts *ucounts; int err; err = -ENOSPC; again: ucounts = inc_ipc_namespaces(user_ns); if (!ucounts) { /* * IPC namespaces are freed asynchronously, by free_ipc_work. * If frees were pending, flush_work will wait, and * return true. Fail the allocation if no frees are pending. */ if (flush_work(&free_ipc_work)) goto again; goto fail; } err = -ENOMEM; ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL_ACCOUNT); if (ns == NULL) goto fail_dec; err = ns_alloc_inum(&ns->ns); if (err) goto fail_free; ns->ns.ops = &ipcns_operations; refcount_set(&ns->ns.count, 1); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; err = mq_init_ns(ns); if (err) goto fail_put; err = -ENOMEM; if (!setup_mq_sysctls(ns)) goto fail_put; if (!setup_ipc_sysctls(ns)) goto fail_mq; err = msg_init_ns(ns); if (err) goto fail_put; sem_init_ns(ns); shm_init_ns(ns); return ns; fail_mq: retire_mq_sysctls(ns); fail_put: put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); fail_free: kfree(ns); fail_dec: dec_ipc_namespaces(ucounts); fail: return ERR_PTR(err); } struct ipc_namespace *copy_ipcs(unsigned long flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (!(flags & CLONE_NEWIPC)) return get_ipc_ns(ns); return create_ipc_ns(user_ns, ns); } /* * free_ipcs - free all ipcs of one type * @ns: the namespace to remove the ipcs from * @ids: the table of ipcs to free * @free: the function called to free each individual ipc * * Called for each kind of ipc when an ipc_namespace exits. */ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, void (*free)(struct ipc_namespace *, struct kern_ipc_perm *)) { struct kern_ipc_perm *perm; int next_id; int total, in_use; down_write(&ids->rwsem); in_use = ids->in_use; for (total = 0, next_id = 0; total < in_use; next_id++) { perm = idr_find(&ids->ipcs_idr, next_id); if (perm == NULL) continue; rcu_read_lock(); ipc_lock_object(perm); free(ns, perm); total++; } up_write(&ids->rwsem); } static void free_ipc_ns(struct ipc_namespace *ns) { /* * Caller needs to wait for an RCU grace period to have passed * after making the mount point inaccessible to new accesses. */ mntput(ns->mq_mnt); sem_exit_ns(ns); msg_exit_ns(ns); shm_exit_ns(ns); retire_mq_sysctls(ns); retire_ipc_sysctls(ns); dec_ipc_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); } static LLIST_HEAD(free_ipc_list); static void free_ipc(struct work_struct *unused) { struct llist_node *node = llist_del_all(&free_ipc_list); struct ipc_namespace *n, *t; llist_for_each_entry_safe(n, t, node, mnt_llist) mnt_make_shortterm(n->mq_mnt); /* Wait for any last users to have gone away. */ synchronize_rcu(); llist_for_each_entry_safe(n, t, node, mnt_llist) free_ipc_ns(n); } /* * put_ipc_ns - drop a reference to an ipc namespace. * @ns: the namespace to put * * If this is the last task in the namespace exiting, and * it is dropping the refcount to 0, then it can race with * a task in another ipc namespace but in a mounts namespace * which has this ipcns's mqueuefs mounted, doing some action * with one of the mqueuefs files. That can raise the refcount. * So dropping the refcount, and raising the refcount when * accessing it through the VFS, are protected with mq_lock. * * (Clearly, a task raising the refcount on its own ipc_ns * needn't take mq_lock since it can't race with the last task * in the ipcns exiting). */ void put_ipc_ns(struct ipc_namespace *ns) { if (refcount_dec_and_lock(&ns->ns.count, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); if (llist_add(&ns->mnt_llist, &free_ipc_list)) schedule_work(&free_ipc_work); } } static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) { return container_of(ns, struct ipc_namespace, ns); } static struct ns_common *ipcns_get(struct task_struct *task) { struct ipc_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) ns = get_ipc_ns(nsproxy->ipc_ns); task_unlock(task); return ns ? &ns->ns : NULL; } static void ipcns_put(struct ns_common *ns) { return put_ipc_ns(to_ipc_ns(ns)); } static int ipcns_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; struct ipc_namespace *ns = to_ipc_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; put_ipc_ns(nsproxy->ipc_ns); nsproxy->ipc_ns = get_ipc_ns(ns); return 0; } static struct user_namespace *ipcns_owner(struct ns_common *ns) { return to_ipc_ns(ns)->user_ns; } const struct proc_ns_operations ipcns_operations = { .name = "ipc", .type = CLONE_NEWIPC, .get = ipcns_get, .put = ipcns_put, .install = ipcns_install, .owner = ipcns_owner, };
1 1 1 1 1 1 1 1 1 3 19 4 4 8 5 4 9 6 1 5 6 6 1 11 10 10 10 2 2 2 10 2 2 3 3 13 12 4 3 10 3 10 6 6 3 1 2 10 10 19 19 1 19 12 12 12 12 12 19 12 19 12 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170