19 19 19 19 19 4 4 4 4 4 4 4 4 4 4 4 3 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 | /* * llc_sap.c - driver routines for SAP component. * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <net/llc.h> #include <net/llc_if.h> #include <net/llc_conn.h> #include <net/llc_pdu.h> #include <net/llc_sap.h> #include <net/llc_s_ac.h> #include <net/llc_s_ev.h> #include <net/llc_s_st.h> #include <net/sock.h> #include <net/tcp_states.h> #include <linux/llc.h> #include <linux/slab.h> static int llc_mac_header_len(unsigned short devtype) { switch (devtype) { case ARPHRD_ETHER: case ARPHRD_LOOPBACK: return sizeof(struct ethhdr); } return 0; } /** * llc_alloc_frame - allocates sk_buff for frame * @dev: network device this skb will be sent over * @type: pdu type to allocate * @data_size: data size to allocate * * Allocates an sk_buff for frame and initializes sk_buff fields. * Returns allocated skb or %NULL when out of memory. */ struct sk_buff *llc_alloc_frame(struct sock *sk, struct net_device *dev, u8 type, u32 data_size) { int hlen = type == LLC_PDU_TYPE_U ? 3 : 4; struct sk_buff *skb; hlen += llc_mac_header_len(dev->type); skb = alloc_skb(hlen + data_size, GFP_ATOMIC); if (skb) { skb_reset_mac_header(skb); skb_reserve(skb, hlen); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->protocol = htons(ETH_P_802_2); skb->dev = dev; if (sk != NULL) skb_set_owner_w(skb, sk); } return skb; } void llc_save_primitive(struct sock *sk, struct sk_buff *skb, u8 prim) { struct sockaddr_llc *addr; /* save primitive for use by the user. */ addr = llc_ui_skb_cb(skb); memset(addr, 0, sizeof(*addr)); addr->sllc_family = sk->sk_family; addr->sllc_arphrd = skb->dev->type; addr->sllc_test = prim == LLC_TEST_PRIM; addr->sllc_xid = prim == LLC_XID_PRIM; addr->sllc_ua = prim == LLC_DATAUNIT_PRIM; llc_pdu_decode_sa(skb, addr->sllc_mac); llc_pdu_decode_ssap(skb, &addr->sllc_sap); } /** * llc_sap_rtn_pdu - Informs upper layer on rx of an UI, XID or TEST pdu. * @sap: pointer to SAP * @skb: received pdu */ void llc_sap_rtn_pdu(struct llc_sap *sap, struct sk_buff *skb) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); switch (LLC_U_PDU_RSP(pdu)) { case LLC_1_PDU_CMD_TEST: ev->prim = LLC_TEST_PRIM; break; case LLC_1_PDU_CMD_XID: ev->prim = LLC_XID_PRIM; break; case LLC_1_PDU_CMD_UI: ev->prim = LLC_DATAUNIT_PRIM; break; } ev->ind_cfm_flag = LLC_IND; } /** * llc_find_sap_trans - finds transition for event * @sap: pointer to SAP * @skb: happened event * * This function finds transition that matches with happened event. * Returns the pointer to found transition on success or %NULL for * failure. */ static struct llc_sap_state_trans *llc_find_sap_trans(struct llc_sap *sap, struct sk_buff *skb) { int i = 0; struct llc_sap_state_trans *rc = NULL; struct llc_sap_state_trans **next_trans; struct llc_sap_state *curr_state = &llc_sap_state_table[sap->state - 1]; /* * Search thru events for this state until list exhausted or until * its obvious the event is not valid for the current state */ for (next_trans = curr_state->transitions; next_trans[i]->ev; i++) if (!next_trans[i]->ev(sap, skb)) { rc = next_trans[i]; /* got event match; return it */ break; } return rc; } /** * llc_exec_sap_trans_actions - execute actions related to event * @sap: pointer to SAP * @trans: pointer to transition that it's actions must be performed * @skb: happened event. * * This function executes actions that is related to happened event. * Returns 0 for success and 1 for failure of at least one action. */ static int llc_exec_sap_trans_actions(struct llc_sap *sap, struct llc_sap_state_trans *trans, struct sk_buff *skb) { int rc = 0; const llc_sap_action_t *next_action = trans->ev_actions; for (; next_action && *next_action; next_action++) if ((*next_action)(sap, skb)) rc = 1; return rc; } /** * llc_sap_next_state - finds transition, execs actions & change SAP state * @sap: pointer to SAP * @skb: happened event * * This function finds transition that matches with happened event, then * executes related actions and finally changes state of SAP. It returns * 0 on success and 1 for failure. */ static int llc_sap_next_state(struct llc_sap *sap, struct sk_buff *skb) { int rc = 1; struct llc_sap_state_trans *trans; if (sap->state > LLC_NR_SAP_STATES) goto out; trans = llc_find_sap_trans(sap, skb); if (!trans) goto out; /* * Got the state to which we next transition; perform the actions * associated with this transition before actually transitioning to the * next state */ rc = llc_exec_sap_trans_actions(sap, trans, skb); if (rc) goto out; /* * Transition SAP to next state if all actions execute successfully */ sap->state = trans->next_state; out: return rc; } /** * llc_sap_state_process - sends event to SAP state machine * @sap: sap to use * @skb: pointer to occurred event * * After executing actions of the event, upper layer will be indicated * if needed(on receiving an UI frame). sk can be null for the * datalink_proto case. * * This function always consumes a reference to the skb. */ static void llc_sap_state_process(struct llc_sap *sap, struct sk_buff *skb) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); ev->ind_cfm_flag = 0; llc_sap_next_state(sap, skb); if (ev->ind_cfm_flag == LLC_IND && skb->sk->sk_state != TCP_LISTEN) { llc_save_primitive(skb->sk, skb, ev->prim); /* queue skb to the user. */ if (sock_queue_rcv_skb(skb->sk, skb) == 0) return; } kfree_skb(skb); } /** * llc_build_and_send_test_pkt - TEST interface for upper layers. * @sap: sap to use * @skb: packet to send * @dmac: destination mac address * @dsap: destination sap * * This function is called when upper layer wants to send a TEST pdu. * Returns 0 for success, 1 otherwise. */ void llc_build_and_send_test_pkt(struct llc_sap *sap, struct sk_buff *skb, u8 *dmac, u8 dsap) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); ev->saddr.lsap = sap->laddr.lsap; ev->daddr.lsap = dsap; memcpy(ev->saddr.mac, skb->dev->dev_addr, IFHWADDRLEN); memcpy(ev->daddr.mac, dmac, IFHWADDRLEN); ev->type = LLC_SAP_EV_TYPE_PRIM; ev->prim = LLC_TEST_PRIM; ev->prim_type = LLC_PRIM_TYPE_REQ; llc_sap_state_process(sap, skb); } /** * llc_build_and_send_xid_pkt - XID interface for upper layers * @sap: sap to use * @skb: packet to send * @dmac: destination mac address * @dsap: destination sap * * This function is called when upper layer wants to send a XID pdu. * Returns 0 for success, 1 otherwise. */ void llc_build_and_send_xid_pkt(struct llc_sap *sap, struct sk_buff *skb, u8 *dmac, u8 dsap) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); ev->saddr.lsap = sap->laddr.lsap; ev->daddr.lsap = dsap; memcpy(ev->saddr.mac, skb->dev->dev_addr, IFHWADDRLEN); memcpy(ev->daddr.mac, dmac, IFHWADDRLEN); ev->type = LLC_SAP_EV_TYPE_PRIM; ev->prim = LLC_XID_PRIM; ev->prim_type = LLC_PRIM_TYPE_REQ; llc_sap_state_process(sap, skb); } /** * llc_sap_rcv - sends received pdus to the sap state machine * @sap: current sap component structure. * @skb: received frame. * * Sends received pdus to the sap state machine. */ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb, struct sock *sk) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); ev->type = LLC_SAP_EV_TYPE_PDU; ev->reason = 0; skb_orphan(skb); sock_hold(sk); skb->sk = sk; skb->destructor = sock_efree; llc_sap_state_process(sap, skb); } static inline bool llc_dgram_match(const struct llc_sap *sap, const struct llc_addr *laddr, const struct sock *sk) { struct llc_sock *llc = llc_sk(sk); return sk->sk_type == SOCK_DGRAM && llc->laddr.lsap == laddr->lsap && ether_addr_equal(llc->laddr.mac, laddr->mac); } /** * llc_lookup_dgram - Finds dgram socket for the local sap/mac * @sap: SAP * @laddr: address of local LLC (MAC + SAP) * * Search socket list of the SAP and finds connection using the local * mac, and local sap. Returns pointer for socket found, %NULL otherwise. */ static struct sock *llc_lookup_dgram(struct llc_sap *sap, const struct llc_addr *laddr) { struct sock *rc; struct hlist_nulls_node *node; int slot = llc_sk_laddr_hashfn(sap, laddr); struct hlist_nulls_head *laddr_hb = &sap->sk_laddr_hash[slot]; rcu_read_lock_bh(); again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_dgram_match(sap, laddr, rc)) { /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!refcount_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || !llc_dgram_match(sap, laddr, rc))) { sock_put(rc); continue; } goto found; } } rc = NULL; /* * if the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ if (unlikely(get_nulls_value(node) != slot)) goto again; found: rcu_read_unlock_bh(); return rc; } static inline bool llc_mcast_match(const struct llc_sap *sap, const struct llc_addr *laddr, const struct sk_buff *skb, const struct sock *sk) { struct llc_sock *llc = llc_sk(sk); return sk->sk_type == SOCK_DGRAM && llc->laddr.lsap == laddr->lsap && llc->dev == skb->dev; } static void llc_do_mcast(struct llc_sap *sap, struct sk_buff *skb, struct sock **stack, int count) { struct sk_buff *skb1; int i; for (i = 0; i < count; i++) { skb1 = skb_clone(skb, GFP_ATOMIC); if (!skb1) { sock_put(stack[i]); continue; } llc_sap_rcv(sap, skb1, stack[i]); sock_put(stack[i]); } } /** * llc_sap_mcast - Deliver multicast PDU's to all matching datagram sockets. * @sap: SAP * @laddr: address of local LLC (MAC + SAP) * * Search socket list of the SAP and finds connections with same sap. * Deliver clone to each. */ static void llc_sap_mcast(struct llc_sap *sap, const struct llc_addr *laddr, struct sk_buff *skb) { int i = 0; struct sock *sk; struct sock *stack[256 / sizeof(struct sock *)]; struct llc_sock *llc; struct hlist_head *dev_hb = llc_sk_dev_hash(sap, skb->dev->ifindex); spin_lock_bh(&sap->sk_lock); hlist_for_each_entry(llc, dev_hb, dev_hash_node) { sk = &llc->sk; if (!llc_mcast_match(sap, laddr, skb, sk)) continue; sock_hold(sk); if (i < ARRAY_SIZE(stack)) stack[i++] = sk; else { llc_do_mcast(sap, skb, stack, i); i = 0; } } spin_unlock_bh(&sap->sk_lock); llc_do_mcast(sap, skb, stack, i); } void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb) { struct llc_addr laddr; llc_pdu_decode_da(skb, laddr.mac); llc_pdu_decode_dsap(skb, &laddr.lsap); if (is_multicast_ether_addr(laddr.mac)) { llc_sap_mcast(sap, &laddr, skb); kfree_skb(skb); } else { struct sock *sk = llc_lookup_dgram(sap, &laddr); if (sk) { llc_sap_rcv(sap, skb, sk); sock_put(sk); } else kfree_skb(skb); } } |
89 30 38 33 30 51 211 18 11 18 38 33 33 7 27 38 31 30 30 31 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 | /* * Pluggable TCP upper layer protocol support. * * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved. * */ #include<linux/module.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/list.h> #include <linux/gfp.h> #include <net/tcp.h> static DEFINE_SPINLOCK(tcp_ulp_list_lock); static LIST_HEAD(tcp_ulp_list); /* Simple linear search, don't expect many entries! */ static struct tcp_ulp_ops *tcp_ulp_find(const char *name) { struct tcp_ulp_ops *e; list_for_each_entry_rcu(e, &tcp_ulp_list, list) { if (strcmp(e->name, name) == 0) return e; } return NULL; } static struct tcp_ulp_ops *tcp_ulp_find_id(const int ulp) { struct tcp_ulp_ops *e; list_for_each_entry_rcu(e, &tcp_ulp_list, list) { if (e->uid == ulp) return e; } return NULL; } static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name) { const struct tcp_ulp_ops *ulp = NULL; rcu_read_lock(); ulp = tcp_ulp_find(name); #ifdef CONFIG_MODULES if (!ulp && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); request_module("tcp-ulp-%s", name); rcu_read_lock(); ulp = tcp_ulp_find(name); } #endif if (!ulp || !try_module_get(ulp->owner)) ulp = NULL; rcu_read_unlock(); return ulp; } static const struct tcp_ulp_ops *__tcp_ulp_lookup(const int uid) { const struct tcp_ulp_ops *ulp; rcu_read_lock(); ulp = tcp_ulp_find_id(uid); if (!ulp || !try_module_get(ulp->owner)) ulp = NULL; rcu_read_unlock(); return ulp; } /* Attach new upper layer protocol to the list * of available protocols. */ int tcp_register_ulp(struct tcp_ulp_ops *ulp) { int ret = 0; spin_lock(&tcp_ulp_list_lock); if (tcp_ulp_find(ulp->name)) ret = -EEXIST; else list_add_tail_rcu(&ulp->list, &tcp_ulp_list); spin_unlock(&tcp_ulp_list_lock); return ret; } EXPORT_SYMBOL_GPL(tcp_register_ulp); void tcp_unregister_ulp(struct tcp_ulp_ops *ulp) { spin_lock(&tcp_ulp_list_lock); list_del_rcu(&ulp->list); spin_unlock(&tcp_ulp_list_lock); synchronize_rcu(); } EXPORT_SYMBOL_GPL(tcp_unregister_ulp); /* Build string with list of available upper layer protocl values */ void tcp_get_available_ulp(char *buf, size_t maxlen) { struct tcp_ulp_ops *ulp_ops; size_t offs = 0; *buf = '\0'; rcu_read_lock(); list_for_each_entry_rcu(ulp_ops, &tcp_ulp_list, list) { offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ulp_ops->name); } rcu_read_unlock(); } void tcp_cleanup_ulp(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); if (!icsk->icsk_ulp_ops) return; if (icsk->icsk_ulp_ops->release) icsk->icsk_ulp_ops->release(sk); module_put(icsk->icsk_ulp_ops->owner); icsk->icsk_ulp_ops = NULL; } /* Change upper layer protocol for socket */ int tcp_set_ulp(struct sock *sk, const char *name) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_ulp_ops *ulp_ops; int err = 0; if (icsk->icsk_ulp_ops) return -EEXIST; ulp_ops = __tcp_ulp_find_autoload(name); if (!ulp_ops) return -ENOENT; if (!ulp_ops->user_visible) { module_put(ulp_ops->owner); return -ENOENT; } err = ulp_ops->init(sk); if (err) { module_put(ulp_ops->owner); return err; } icsk->icsk_ulp_ops = ulp_ops; return 0; } int tcp_set_ulp_id(struct sock *sk, int ulp) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_ulp_ops *ulp_ops; int err; if (icsk->icsk_ulp_ops) return -EEXIST; ulp_ops = __tcp_ulp_lookup(ulp); if (!ulp_ops) return -ENOENT; err = ulp_ops->init(sk); if (err) { module_put(ulp_ops->owner); return err; } icsk->icsk_ulp_ops = ulp_ops; return 0; } |
5 5 5 5 5 5 5 6 5 6 6 6 7 6 7 684 684 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 | /* * This file implement the Wireless Extensions proc API. * * Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com> * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved. * * (As all part of the Linux kernel, this file is GPL) */ /* * The /proc/net/wireless file is a human readable user-space interface * exporting various wireless specific statistics from the wireless devices. * This is the most popular part of the Wireless Extensions ;-) * * This interface is a pure clone of /proc/net/dev (in net/core/dev.c). * The content of the file is basically the content of "struct iw_statistics". */ #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/wireless.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <net/iw_handler.h> #include <net/wext.h> static void wireless_seq_printf_stats(struct seq_file *seq, struct net_device *dev) { /* Get stats from the driver */ struct iw_statistics *stats = get_wireless_stats(dev); static struct iw_statistics nullstats = {}; /* show device if it's wireless regardless of current stats */ if (!stats) { #ifdef CONFIG_WIRELESS_EXT if (dev->wireless_handlers) stats = &nullstats; #endif #ifdef CONFIG_CFG80211 if (dev->ieee80211_ptr) stats = &nullstats; #endif } if (stats) { seq_printf(seq, "%6s: %04x %3d%c %3d%c %3d%c %6d %6d %6d " "%6d %6d %6d\n", dev->name, stats->status, stats->qual.qual, stats->qual.updated & IW_QUAL_QUAL_UPDATED ? '.' : ' ', ((__s32) stats->qual.level) - ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0), stats->qual.updated & IW_QUAL_LEVEL_UPDATED ? '.' : ' ', ((__s32) stats->qual.noise) - ((stats->qual.updated & IW_QUAL_DBM) ? 0x100 : 0), stats->qual.updated & IW_QUAL_NOISE_UPDATED ? '.' : ' ', stats->discard.nwid, stats->discard.code, stats->discard.fragment, stats->discard.retries, stats->discard.misc, stats->miss.beacon); if (stats != &nullstats) stats->qual.updated &= ~IW_QUAL_ALL_UPDATED; } } /* ---------------------------------------------------------------- */ /* * Print info for /proc/net/wireless (print all entries) */ static int wireless_dev_seq_show(struct seq_file *seq, void *v) { might_sleep(); if (v == SEQ_START_TOKEN) seq_printf(seq, "Inter-| sta-| Quality | Discarded " "packets | Missed | WE\n" " face | tus | link level noise | nwid " "crypt frag retry misc | beacon | %d\n", WIRELESS_EXT); else wireless_seq_printf_stats(seq, v); return 0; } static void *wireless_dev_seq_start(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); loff_t off; struct net_device *dev; rtnl_lock(); if (!*pos) return SEQ_START_TOKEN; off = 1; for_each_netdev(net, dev) if (off++ == *pos) return dev; return NULL; } static void *wireless_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct net *net = seq_file_net(seq); ++*pos; return v == SEQ_START_TOKEN ? first_net_device(net) : next_net_device(v); } static void wireless_dev_seq_stop(struct seq_file *seq, void *v) { rtnl_unlock(); } static const struct seq_operations wireless_seq_ops = { .start = wireless_dev_seq_start, .next = wireless_dev_seq_next, .stop = wireless_dev_seq_stop, .show = wireless_dev_seq_show, }; int __net_init wext_proc_init(struct net *net) { /* Create /proc/net/wireless entry */ if (!proc_create_net("wireless", 0444, net->proc_net, &wireless_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; return 0; } void __net_exit wext_proc_exit(struct net *net) { remove_proc_entry("wireless", net->proc_net); } |
38 36 34 32 31 31 31 31 9 7 7 7 7 30 2 28 17 3 6 6 3 6 17 30 26 12 12 12 12 8 8 12 2 12 1 12 2 2 3 5 5 4 1 1 3 3 3 3 2 1 7 7 9 45 43 39 12 12 9 11 11 4 7 9 9 7 8 36 36 27 9 36 36 36 22 16 35 3 2 3 35 20 33 35 33 35 22 35 19 1 35 30 11 31 12 31 30 35 36 10 27 35 4 34 30 4 4 3 2 1 2 28 28 7 7 21 28 21 4 3 4 3 4 24 25 3 3 2 23 6 6 1 5 23 5 25 4 4 20 20 24 25 4 4 4 115 113 110 110 108 45 45 44 40 6 6 2 7 8 104 115 6 1 1 6 6 60 16 16 2442 36 21 20 2 20 4 1 1 3 3 20 4 20 8 20 8 25 23 11 10 9 13 22 22 22 24 25 11 5 7 1 7 1 7 11 306 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 | /* * bcm.c - Broadcast Manager to filter/send (cyclic) CAN content * * Copyright (c) 2002-2017 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include <linux/module.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/hrtimer.h> #include <linux/list.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/uio.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/bcm.h> #include <linux/slab.h> #include <net/sock.h> #include <net/net_namespace.h> /* * To send multiple CAN frame content within TX_SETUP or to filter * CAN messages with multiplex index within RX_SETUP, the number of * different filters is limited to 256 due to the one byte index value. */ #define MAX_NFRAMES 256 /* limit timers to 400 days for sending/timeouts */ #define BCM_TIMER_SEC_MAX (400 * 24 * 60 * 60) /* use of last_frames[index].flags */ #define RX_RECV 0x40 /* received data for this element */ #define RX_THR 0x80 /* element not been sent due to throttle feature */ #define BCM_CAN_FLAGS_MASK 0x3F /* to clean private flags after usage */ /* get best masking value for can_rx_register() for a given single can_id */ #define REGMASK(id) ((id & CAN_EFF_FLAG) ? \ (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) #define CAN_BCM_VERSION "20170425" MODULE_DESCRIPTION("PF_CAN broadcast manager protocol"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>"); MODULE_ALIAS("can-proto-2"); /* * easy access to the first 64 bit of can(fd)_frame payload. cp->data is * 64 bit aligned so the offset has to be multiples of 8 which is ensured * by the only callers in bcm_rx_cmp_to_index() bcm_rx_handler(). */ static inline u64 get_u64(const struct canfd_frame *cp, int offset) { return *(u64 *)(cp->data + offset); } struct bcm_op { struct list_head list; struct rcu_head rcu; int ifindex; canid_t can_id; u32 flags; unsigned long frames_abs, frames_filtered; struct bcm_timeval ival1, ival2; struct hrtimer timer, thrtimer; ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg; int rx_ifindex; int cfsiz; u32 count; u32 nframes; u32 currframe; /* void pointers to arrays of struct can[fd]_frame */ void *frames; void *last_frames; struct canfd_frame sframe; struct canfd_frame last_sframe; struct sock *sk; struct net_device *rx_reg_dev; }; struct bcm_sock { struct sock sk; int bound; int ifindex; struct list_head notifier; struct list_head rx_ops; struct list_head tx_ops; unsigned long dropped_usr_msgs; struct proc_dir_entry *bcm_proc_read; char procname [32]; /* inode number in decimal with \0 */ }; static LIST_HEAD(bcm_notifier_list); static DEFINE_SPINLOCK(bcm_notifier_lock); static struct bcm_sock *bcm_busy_notifier; static inline struct bcm_sock *bcm_sk(const struct sock *sk) { return (struct bcm_sock *)sk; } static inline ktime_t bcm_timeval_to_ktime(struct bcm_timeval tv) { return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC); } /* check limitations for timeval provided by user */ static bool bcm_is_invalid_tv(struct bcm_msg_head *msg_head) { if ((msg_head->ival1.tv_sec < 0) || (msg_head->ival1.tv_sec > BCM_TIMER_SEC_MAX) || (msg_head->ival1.tv_usec < 0) || (msg_head->ival1.tv_usec >= USEC_PER_SEC) || (msg_head->ival2.tv_sec < 0) || (msg_head->ival2.tv_sec > BCM_TIMER_SEC_MAX) || (msg_head->ival2.tv_usec < 0) || (msg_head->ival2.tv_usec >= USEC_PER_SEC)) return true; return false; } #define CFSIZ(flags) ((flags & CAN_FD_FRAME) ? CANFD_MTU : CAN_MTU) #define OPSIZ sizeof(struct bcm_op) #define MHSIZ sizeof(struct bcm_msg_head) /* * procfs functions */ #if IS_ENABLED(CONFIG_PROC_FS) static char *bcm_proc_getifname(struct net *net, char *result, int ifindex) { struct net_device *dev; if (!ifindex) return "any"; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) strcpy(result, dev->name); else strcpy(result, "???"); rcu_read_unlock(); return result; } static int bcm_proc_show(struct seq_file *m, void *v) { char ifname[IFNAMSIZ]; struct net *net = m->private; struct sock *sk = (struct sock *)PDE_DATA(m->file->f_inode); struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; seq_printf(m, ">>> socket %pK", sk->sk_socket); seq_printf(m, " / sk %pK", sk); seq_printf(m, " / bo %pK", bo); seq_printf(m, " / dropped %lu", bo->dropped_usr_msgs); seq_printf(m, " / bound %s", bcm_proc_getifname(net, ifname, bo->ifindex)); seq_printf(m, " <<<\n"); list_for_each_entry(op, &bo->rx_ops, list) { unsigned long reduction; /* print only active entries & prevent division by zero */ if (!op->frames_abs) continue; seq_printf(m, "rx_op: %03X %-5s ", op->can_id, bcm_proc_getifname(net, ifname, op->ifindex)); if (op->flags & CAN_FD_FRAME) seq_printf(m, "(%u)", op->nframes); else seq_printf(m, "[%u]", op->nframes); seq_printf(m, "%c ", (op->flags & RX_CHECK_DLC) ? 'd' : ' '); if (op->kt_ival1) seq_printf(m, "timeo=%lld ", (long long)ktime_to_us(op->kt_ival1)); if (op->kt_ival2) seq_printf(m, "thr=%lld ", (long long)ktime_to_us(op->kt_ival2)); seq_printf(m, "# recv %ld (%ld) => reduction: ", op->frames_filtered, op->frames_abs); reduction = 100 - (op->frames_filtered * 100) / op->frames_abs; seq_printf(m, "%s%ld%%\n", (reduction == 100) ? "near " : "", reduction); } list_for_each_entry(op, &bo->tx_ops, list) { seq_printf(m, "tx_op: %03X %s ", op->can_id, bcm_proc_getifname(net, ifname, op->ifindex)); if (op->flags & CAN_FD_FRAME) seq_printf(m, "(%u) ", op->nframes); else seq_printf(m, "[%u] ", op->nframes); if (op->kt_ival1) seq_printf(m, "t1=%lld ", (long long)ktime_to_us(op->kt_ival1)); if (op->kt_ival2) seq_printf(m, "t2=%lld ", (long long)ktime_to_us(op->kt_ival2)); seq_printf(m, "# sent %ld\n", op->frames_abs); } seq_putc(m, '\n'); return 0; } #endif /* CONFIG_PROC_FS */ /* * bcm_can_tx - send the (next) CAN frame to the appropriate CAN interface * of the given bcm tx op */ static void bcm_can_tx(struct bcm_op *op) { struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf = op->frames + op->cfsiz * op->currframe; int err; /* no target device? => exit */ if (!op->ifindex) return; dev = dev_get_by_index(sock_net(op->sk), op->ifindex); if (!dev) { /* RFC: should this bcm_op remove itself here? */ return; } skb = alloc_skb(op->cfsiz + sizeof(struct can_skb_priv), gfp_any()); if (!skb) goto out; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; skb_put_data(skb, cf, op->cfsiz); /* send with loopback */ skb->dev = dev; can_skb_set_owner(skb, op->sk); err = can_send(skb, 1); if (!err) op->frames_abs++; op->currframe++; /* reached last frame? */ if (op->currframe >= op->nframes) op->currframe = 0; out: dev_put(dev); } /* * bcm_send_to_user - send a BCM message to the userspace * (consisting of bcm_msg_head + x CAN frames) */ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, struct canfd_frame *frames, int has_timestamp) { struct sk_buff *skb; struct canfd_frame *firstframe; struct sockaddr_can *addr; struct sock *sk = op->sk; unsigned int datalen = head->nframes * op->cfsiz; int err; skb = alloc_skb(sizeof(*head) + datalen, gfp_any()); if (!skb) return; skb_put_data(skb, head, sizeof(*head)); if (head->nframes) { /* CAN frames starting here */ firstframe = (struct canfd_frame *)skb_tail_pointer(skb); skb_put_data(skb, frames, datalen); /* * the BCM uses the flags-element of the canfd_frame * structure for internal purposes. This is only * relevant for updates that are generated by the * BCM, where nframes is 1 */ if (head->nframes == 1) firstframe->flags &= BCM_CAN_FLAGS_MASK; } if (has_timestamp) { /* restore rx timestamp */ skb->tstamp = op->rx_stamp; } /* * Put the datagram to the queue so that bcm_recvmsg() can * get it from there. We need to pass the interface index to * bcm_recvmsg(). We pass a whole struct sockaddr_can in skb->cb * containing the interface index. */ sock_skb_cb_check_size(sizeof(struct sockaddr_can)); addr = (struct sockaddr_can *)skb->cb; memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; addr->can_ifindex = op->rx_ifindex; err = sock_queue_rcv_skb(sk, skb); if (err < 0) { struct bcm_sock *bo = bcm_sk(sk); kfree_skb(skb); /* don't care about overflows in this statistic */ bo->dropped_usr_msgs++; } } static bool bcm_tx_set_expiry(struct bcm_op *op, struct hrtimer *hrt) { ktime_t ival; if (op->kt_ival1 && op->count) ival = op->kt_ival1; else if (op->kt_ival2) ival = op->kt_ival2; else return false; hrtimer_set_expires(hrt, ktime_add(ktime_get(), ival)); return true; } static void bcm_tx_start_timer(struct bcm_op *op) { if (bcm_tx_set_expiry(op, &op->timer)) hrtimer_start_expires(&op->timer, HRTIMER_MODE_ABS_SOFT); } /* bcm_tx_timeout_handler - performs cyclic CAN frame transmissions */ static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) { struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); struct bcm_msg_head msg_head; if (op->kt_ival1 && (op->count > 0)) { op->count--; if (!op->count && (op->flags & TX_COUNTEVT)) { /* create notification to user */ memset(&msg_head, 0, sizeof(msg_head)); msg_head.opcode = TX_EXPIRED; msg_head.flags = op->flags; msg_head.count = op->count; msg_head.ival1 = op->ival1; msg_head.ival2 = op->ival2; msg_head.can_id = op->can_id; msg_head.nframes = 0; bcm_send_to_user(op, &msg_head, NULL, 0); } bcm_can_tx(op); } else if (op->kt_ival2) { bcm_can_tx(op); } return bcm_tx_set_expiry(op, &op->timer) ? HRTIMER_RESTART : HRTIMER_NORESTART; } /* * bcm_rx_changed - create a RX_CHANGED notification due to changed content */ static void bcm_rx_changed(struct bcm_op *op, struct canfd_frame *data) { struct bcm_msg_head head; /* update statistics */ op->frames_filtered++; /* prevent statistics overflow */ if (op->frames_filtered > ULONG_MAX/100) op->frames_filtered = op->frames_abs = 0; /* this element is not throttled anymore */ data->flags &= (BCM_CAN_FLAGS_MASK|RX_RECV); memset(&head, 0, sizeof(head)); head.opcode = RX_CHANGED; head.flags = op->flags; head.count = op->count; head.ival1 = op->ival1; head.ival2 = op->ival2; head.can_id = op->can_id; head.nframes = 1; bcm_send_to_user(op, &head, data, 1); } /* * bcm_rx_update_and_send - process a detected relevant receive content change * 1. update the last received data * 2. send a notification to the user (if possible) */ static void bcm_rx_update_and_send(struct bcm_op *op, struct canfd_frame *lastdata, const struct canfd_frame *rxdata) { memcpy(lastdata, rxdata, op->cfsiz); /* mark as used and throttled by default */ lastdata->flags |= (RX_RECV|RX_THR); /* throttling mode inactive ? */ if (!op->kt_ival2) { /* send RX_CHANGED to the user immediately */ bcm_rx_changed(op, lastdata); return; } /* with active throttling timer we are just done here */ if (hrtimer_active(&op->thrtimer)) return; /* first reception with enabled throttling mode */ if (!op->kt_lastmsg) goto rx_changed_settime; /* got a second frame inside a potential throttle period? */ if (ktime_us_delta(ktime_get(), op->kt_lastmsg) < ktime_to_us(op->kt_ival2)) { /* do not send the saved data - only start throttle timer */ hrtimer_start(&op->thrtimer, ktime_add(op->kt_lastmsg, op->kt_ival2), HRTIMER_MODE_ABS_SOFT); return; } /* the gap was that big, that throttling was not needed here */ rx_changed_settime: bcm_rx_changed(op, lastdata); op->kt_lastmsg = ktime_get(); } /* * bcm_rx_cmp_to_index - (bit)compares the currently received data to formerly * received data stored in op->last_frames[] */ static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index, const struct canfd_frame *rxdata) { struct canfd_frame *cf = op->frames + op->cfsiz * index; struct canfd_frame *lcf = op->last_frames + op->cfsiz * index; int i; /* * no one uses the MSBs of flags for comparison, * so we use it here to detect the first time of reception */ if (!(lcf->flags & RX_RECV)) { /* received data for the first time => send update to user */ bcm_rx_update_and_send(op, lcf, rxdata); return; } /* do a real check in CAN frame data section */ for (i = 0; i < rxdata->len; i += 8) { if ((get_u64(cf, i) & get_u64(rxdata, i)) != (get_u64(cf, i) & get_u64(lcf, i))) { bcm_rx_update_and_send(op, lcf, rxdata); return; } } if (op->flags & RX_CHECK_DLC) { /* do a real check in CAN frame length */ if (rxdata->len != lcf->len) { bcm_rx_update_and_send(op, lcf, rxdata); return; } } } /* * bcm_rx_starttimer - enable timeout monitoring for CAN frame reception */ static void bcm_rx_starttimer(struct bcm_op *op) { if (op->flags & RX_NO_AUTOTIMER) return; if (op->kt_ival1) hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT); } /* bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out */ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) { struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); struct bcm_msg_head msg_head; /* if user wants to be informed, when cyclic CAN-Messages come back */ if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) { /* clear received CAN frames to indicate 'nothing received' */ memset(op->last_frames, 0, op->nframes * op->cfsiz); } /* create notification to user */ memset(&msg_head, 0, sizeof(msg_head)); msg_head.opcode = RX_TIMEOUT; msg_head.flags = op->flags; msg_head.count = op->count; msg_head.ival1 = op->ival1; msg_head.ival2 = op->ival2; msg_head.can_id = op->can_id; msg_head.nframes = 0; bcm_send_to_user(op, &msg_head, NULL, 0); return HRTIMER_NORESTART; } /* * bcm_rx_do_flush - helper for bcm_rx_thr_flush */ static inline int bcm_rx_do_flush(struct bcm_op *op, unsigned int index) { struct canfd_frame *lcf = op->last_frames + op->cfsiz * index; if ((op->last_frames) && (lcf->flags & RX_THR)) { bcm_rx_changed(op, lcf); return 1; } return 0; } /* * bcm_rx_thr_flush - Check for throttled data and send it to the userspace */ static int bcm_rx_thr_flush(struct bcm_op *op) { int updated = 0; if (op->nframes > 1) { unsigned int i; /* for MUX filter we start at index 1 */ for (i = 1; i < op->nframes; i++) updated += bcm_rx_do_flush(op, i); } else { /* for RX_FILTER_ID and simple filter */ updated += bcm_rx_do_flush(op, 0); } return updated; } /* * bcm_rx_thr_handler - the time for blocked content updates is over now: * Check for throttled data and send it to the userspace */ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer) { struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer); if (bcm_rx_thr_flush(op)) { hrtimer_forward(hrtimer, ktime_get(), op->kt_ival2); return HRTIMER_RESTART; } else { /* rearm throttle handling */ op->kt_lastmsg = 0; return HRTIMER_NORESTART; } } /* * bcm_rx_handler - handle a CAN frame reception */ static void bcm_rx_handler(struct sk_buff *skb, void *data) { struct bcm_op *op = (struct bcm_op *)data; const struct canfd_frame *rxframe = (struct canfd_frame *)skb->data; unsigned int i; if (op->can_id != rxframe->can_id) return; /* make sure to handle the correct frame type (CAN / CAN FD) */ if (skb->len != op->cfsiz) return; /* disable timeout */ hrtimer_cancel(&op->timer); /* save rx timestamp */ op->rx_stamp = skb->tstamp; /* save originator for recvfrom() */ op->rx_ifindex = skb->dev->ifindex; /* update statistics */ op->frames_abs++; if (op->flags & RX_RTR_FRAME) { /* send reply for RTR-request (placed in op->frames[0]) */ bcm_can_tx(op); return; } if (op->flags & RX_FILTER_ID) { /* the easiest case */ bcm_rx_update_and_send(op, op->last_frames, rxframe); goto rx_starttimer; } if (op->nframes == 1) { /* simple compare with index 0 */ bcm_rx_cmp_to_index(op, 0, rxframe); goto rx_starttimer; } if (op->nframes > 1) { /* * multiplex compare * * find the first multiplex mask that fits. * Remark: The MUX-mask is stored in index 0 - but only the * first 64 bits of the frame data[] are relevant (CAN FD) */ for (i = 1; i < op->nframes; i++) { if ((get_u64(op->frames, 0) & get_u64(rxframe, 0)) == (get_u64(op->frames, 0) & get_u64(op->frames + op->cfsiz * i, 0))) { bcm_rx_cmp_to_index(op, i, rxframe); break; } } } rx_starttimer: bcm_rx_starttimer(op); } /* * helpers for bcm_op handling: find & delete bcm [rx|tx] op elements */ static struct bcm_op *bcm_find_op(struct list_head *ops, struct bcm_msg_head *mh, int ifindex) { struct bcm_op *op; list_for_each_entry(op, ops, list) { if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) return op; } return NULL; } static void bcm_free_op_rcu(struct rcu_head *rcu_head) { struct bcm_op *op = container_of(rcu_head, struct bcm_op, rcu); if ((op->frames) && (op->frames != &op->sframe)) kfree(op->frames); if ((op->last_frames) && (op->last_frames != &op->last_sframe)) kfree(op->last_frames); kfree(op); } static void bcm_remove_op(struct bcm_op *op) { hrtimer_cancel(&op->timer); hrtimer_cancel(&op->thrtimer); call_rcu(&op->rcu, bcm_free_op_rcu); } static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op) { if (op->rx_reg_dev == dev) { can_rx_unregister(dev_net(dev), dev, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op); /* mark as removed subscription */ op->rx_reg_dev = NULL; } else printk(KERN_ERR "can-bcm: bcm_rx_unreg: registered device " "mismatch %p %p\n", op->rx_reg_dev, dev); } /* * bcm_delete_rx_op - find and remove a rx op (returns number of removed ops) */ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh, int ifindex) { struct bcm_op *op, *n; list_for_each_entry_safe(op, n, ops, list) { if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) { /* disable automatic timer on frame reception */ op->flags |= RX_NO_AUTOTIMER; /* * Don't care if we're bound or not (due to netdev * problems) can_rx_unregister() is always a save * thing to do here. */ if (op->ifindex) { /* * Only remove subscriptions that had not * been removed due to NETDEV_UNREGISTER * in bcm_notifier() */ if (op->rx_reg_dev) { struct net_device *dev; dev = dev_get_by_index(sock_net(op->sk), op->ifindex); if (dev) { bcm_rx_unreg(dev, op); dev_put(dev); } } } else can_rx_unregister(sock_net(op->sk), NULL, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op); list_del(&op->list); bcm_remove_op(op); return 1; /* done */ } } return 0; /* not found */ } /* * bcm_delete_tx_op - find and remove a tx op (returns number of removed ops) */ static int bcm_delete_tx_op(struct list_head *ops, struct bcm_msg_head *mh, int ifindex) { struct bcm_op *op, *n; list_for_each_entry_safe(op, n, ops, list) { if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) { list_del(&op->list); bcm_remove_op(op); return 1; /* done */ } } return 0; /* not found */ } /* * bcm_read_op - read out a bcm_op and send it to the user (for bcm_sendmsg) */ static int bcm_read_op(struct list_head *ops, struct bcm_msg_head *msg_head, int ifindex) { struct bcm_op *op = bcm_find_op(ops, msg_head, ifindex); if (!op) return -EINVAL; /* put current values into msg_head */ msg_head->flags = op->flags; msg_head->count = op->count; msg_head->ival1 = op->ival1; msg_head->ival2 = op->ival2; msg_head->nframes = op->nframes; bcm_send_to_user(op, msg_head, op->frames, 0); return MHSIZ; } /* * bcm_tx_setup - create or update a bcm tx op (for bcm_sendmsg) */ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, int ifindex, struct sock *sk) { struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; struct canfd_frame *cf; unsigned int i; int err; /* we need a real device to send frames */ if (!ifindex) return -ENODEV; /* check nframes boundaries - we need at least one CAN frame */ if (msg_head->nframes < 1 || msg_head->nframes > MAX_NFRAMES) return -EINVAL; /* check timeval limitations */ if ((msg_head->flags & SETTIMER) && bcm_is_invalid_tv(msg_head)) return -EINVAL; /* check the given can_id */ op = bcm_find_op(&bo->tx_ops, msg_head, ifindex); if (op) { /* update existing BCM operation */ /* * Do we need more space for the CAN frames than currently * allocated? -> This is a _really_ unusual use-case and * therefore (complexity / locking) it is not supported. */ if (msg_head->nframes > op->nframes) return -E2BIG; /* update CAN frames content */ for (i = 0; i < msg_head->nframes; i++) { cf = op->frames + op->cfsiz * i; err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz); if (op->flags & CAN_FD_FRAME) { if (cf->len > 64) err = -EINVAL; } else { if (cf->len > 8) err = -EINVAL; } if (err < 0) return err; if (msg_head->flags & TX_CP_CAN_ID) { /* copy can_id into frame */ cf->can_id = msg_head->can_id; } } op->flags = msg_head->flags; } else { /* insert new BCM operation for the given can_id */ op = kzalloc(OPSIZ, GFP_KERNEL); if (!op) return -ENOMEM; op->can_id = msg_head->can_id; op->cfsiz = CFSIZ(msg_head->flags); op->flags = msg_head->flags; /* create array for CAN frames and copy the data */ if (msg_head->nframes > 1) { op->frames = kmalloc_array(msg_head->nframes, op->cfsiz, GFP_KERNEL); if (!op->frames) { kfree(op); return -ENOMEM; } } else op->frames = &op->sframe; for (i = 0; i < msg_head->nframes; i++) { cf = op->frames + op->cfsiz * i; err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz); if (op->flags & CAN_FD_FRAME) { if (cf->len > 64) err = -EINVAL; } else { if (cf->len > 8) err = -EINVAL; } if (err < 0) { if (op->frames != &op->sframe) kfree(op->frames); kfree(op); return err; } if (msg_head->flags & TX_CP_CAN_ID) { /* copy can_id into frame */ cf->can_id = msg_head->can_id; } } /* tx_ops never compare with previous received messages */ op->last_frames = NULL; /* bcm_can_tx / bcm_tx_timeout_handler needs this */ op->sk = sk; op->ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); op->timer.function = bcm_tx_timeout_handler; /* currently unused in tx_ops */ hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); /* add this bcm_op to the list of the tx_ops */ list_add(&op->list, &bo->tx_ops); } /* if ((op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex))) */ if (op->nframes != msg_head->nframes) { op->nframes = msg_head->nframes; /* start multiple frame transmission with index 0 */ op->currframe = 0; } /* check flags */ if (op->flags & TX_RESET_MULTI_IDX) { /* start multiple frame transmission with index 0 */ op->currframe = 0; } if (op->flags & SETTIMER) { /* set timer values */ op->count = msg_head->count; op->ival1 = msg_head->ival1; op->ival2 = msg_head->ival2; op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1); op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2); /* disable an active timer due to zero values? */ if (!op->kt_ival1 && !op->kt_ival2) hrtimer_cancel(&op->timer); } if (op->flags & STARTTIMER) { hrtimer_cancel(&op->timer); /* spec: send CAN frame when starting timer */ op->flags |= TX_ANNOUNCE; } if (op->flags & TX_ANNOUNCE) { bcm_can_tx(op); if (op->count) op->count--; } if (op->flags & STARTTIMER) bcm_tx_start_timer(op); return msg_head->nframes * op->cfsiz + MHSIZ; } /* * bcm_rx_setup - create or update a bcm rx op (for bcm_sendmsg) */ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, int ifindex, struct sock *sk) { struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; int do_rx_register; int err = 0; if ((msg_head->flags & RX_FILTER_ID) || (!(msg_head->nframes))) { /* be robust against wrong usage ... */ msg_head->flags |= RX_FILTER_ID; /* ignore trailing garbage */ msg_head->nframes = 0; } /* the first element contains the mux-mask => MAX_NFRAMES + 1 */ if (msg_head->nframes > MAX_NFRAMES + 1) return -EINVAL; if ((msg_head->flags & RX_RTR_FRAME) && ((msg_head->nframes != 1) || (!(msg_head->can_id & CAN_RTR_FLAG)))) return -EINVAL; /* check timeval limitations */ if ((msg_head->flags & SETTIMER) && bcm_is_invalid_tv(msg_head)) return -EINVAL; /* check the given can_id */ op = bcm_find_op(&bo->rx_ops, msg_head, ifindex); if (op) { /* update existing BCM operation */ /* * Do we need more space for the CAN frames than currently * allocated? -> This is a _really_ unusual use-case and * therefore (complexity / locking) it is not supported. */ if (msg_head->nframes > op->nframes) return -E2BIG; if (msg_head->nframes) { /* update CAN frames content */ err = memcpy_from_msg(op->frames, msg, msg_head->nframes * op->cfsiz); if (err < 0) return err; /* clear last_frames to indicate 'nothing received' */ memset(op->last_frames, 0, msg_head->nframes * op->cfsiz); } op->nframes = msg_head->nframes; op->flags = msg_head->flags; /* Only an update -> do not call can_rx_register() */ do_rx_register = 0; } else { /* insert new BCM operation for the given can_id */ op = kzalloc(OPSIZ, GFP_KERNEL); if (!op) return -ENOMEM; op->can_id = msg_head->can_id; op->nframes = msg_head->nframes; op->cfsiz = CFSIZ(msg_head->flags); op->flags = msg_head->flags; if (msg_head->nframes > 1) { /* create array for CAN frames and copy the data */ op->frames = kmalloc_array(msg_head->nframes, op->cfsiz, GFP_KERNEL); if (!op->frames) { kfree(op); return -ENOMEM; } /* create and init array for received CAN frames */ op->last_frames = kcalloc(msg_head->nframes, op->cfsiz, GFP_KERNEL); if (!op->last_frames) { kfree(op->frames); kfree(op); return -ENOMEM; } } else { op->frames = &op->sframe; op->last_frames = &op->last_sframe; } if (msg_head->nframes) { err = memcpy_from_msg(op->frames, msg, msg_head->nframes * op->cfsiz); if (err < 0) { if (op->frames != &op->sframe) kfree(op->frames); if (op->last_frames != &op->last_sframe) kfree(op->last_frames); kfree(op); return err; } } /* bcm_can_tx / bcm_tx_timeout_handler needs this */ op->sk = sk; op->ifindex = ifindex; /* ifindex for timeout events w/o previous frame reception */ op->rx_ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); op->timer.function = bcm_rx_timeout_handler; hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); op->thrtimer.function = bcm_rx_thr_handler; /* add this bcm_op to the list of the rx_ops */ list_add(&op->list, &bo->rx_ops); /* call can_rx_register() */ do_rx_register = 1; } /* if ((op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex))) */ /* check flags */ if (op->flags & RX_RTR_FRAME) { struct canfd_frame *frame0 = op->frames; /* no timers in RTR-mode */ hrtimer_cancel(&op->thrtimer); hrtimer_cancel(&op->timer); /* * funny feature in RX(!)_SETUP only for RTR-mode: * copy can_id into frame BUT without RTR-flag to * prevent a full-load-loopback-test ... ;-] */ if ((op->flags & TX_CP_CAN_ID) || (frame0->can_id == op->can_id)) frame0->can_id = op->can_id & ~CAN_RTR_FLAG; } else { if (op->flags & SETTIMER) { /* set timer value */ op->ival1 = msg_head->ival1; op->ival2 = msg_head->ival2; op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1); op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2); /* disable an active timer due to zero value? */ if (!op->kt_ival1) hrtimer_cancel(&op->timer); /* * In any case cancel the throttle timer, flush * potentially blocked msgs and reset throttle handling */ op->kt_lastmsg = 0; hrtimer_cancel(&op->thrtimer); bcm_rx_thr_flush(op); } if ((op->flags & STARTTIMER) && op->kt_ival1) hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT); } /* now we can register for can_ids, if we added a new bcm_op */ if (do_rx_register) { if (ifindex) { struct net_device *dev; dev = dev_get_by_index(sock_net(sk), ifindex); if (dev) { err = can_rx_register(sock_net(sk), dev, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op, "bcm", sk); op->rx_reg_dev = dev; dev_put(dev); } } else err = can_rx_register(sock_net(sk), NULL, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op, "bcm", sk); if (err) { /* this bcm rx op is broken -> remove it */ list_del(&op->list); bcm_remove_op(op); return err; } } return msg_head->nframes * op->cfsiz + MHSIZ; } /* * bcm_tx_send - send a single CAN frame to the CAN interface (for bcm_sendmsg) */ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk, int cfsiz) { struct sk_buff *skb; struct net_device *dev; int err; /* we need a real device to send frames */ if (!ifindex) return -ENODEV; skb = alloc_skb(cfsiz + sizeof(struct can_skb_priv), GFP_KERNEL); if (!skb) return -ENOMEM; can_skb_reserve(skb); err = memcpy_from_msg(skb_put(skb, cfsiz), msg, cfsiz); if (err < 0) { kfree_skb(skb); return err; } dev = dev_get_by_index(sock_net(sk), ifindex); if (!dev) { kfree_skb(skb); return -ENODEV; } can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; skb->dev = dev; can_skb_set_owner(skb, sk); err = can_send(skb, 1); /* send with loopback */ dev_put(dev); if (err) return err; return cfsiz + MHSIZ; } /* * bcm_sendmsg - process BCM commands (opcodes) from the userspace */ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct bcm_sock *bo = bcm_sk(sk); int ifindex = bo->ifindex; /* default ifindex for this bcm_op */ struct bcm_msg_head msg_head; int cfsiz; int ret; /* read bytes or error codes as return value */ if (!bo->bound) return -ENOTCONN; /* check for valid message length from userspace */ if (size < MHSIZ) return -EINVAL; /* read message head information */ ret = memcpy_from_msg((u8 *)&msg_head, msg, MHSIZ); if (ret < 0) return ret; cfsiz = CFSIZ(msg_head.flags); if ((size - MHSIZ) % cfsiz) return -EINVAL; /* check for alternative ifindex for this bcm_op */ if (!ifindex && msg->msg_name) { /* no bound device as default => check msg_name */ DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name); if (msg->msg_namelen < sizeof(*addr)) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; /* ifindex from sendto() */ ifindex = addr->can_ifindex; if (ifindex) { struct net_device *dev; dev = dev_get_by_index(sock_net(sk), ifindex); if (!dev) return -ENODEV; if (dev->type != ARPHRD_CAN) { dev_put(dev); return -ENODEV; } dev_put(dev); } } lock_sock(sk); switch (msg_head.opcode) { case TX_SETUP: ret = bcm_tx_setup(&msg_head, msg, ifindex, sk); break; case RX_SETUP: ret = bcm_rx_setup(&msg_head, msg, ifindex, sk); break; case TX_DELETE: if (bcm_delete_tx_op(&bo->tx_ops, &msg_head, ifindex)) ret = MHSIZ; else ret = -EINVAL; break; case RX_DELETE: if (bcm_delete_rx_op(&bo->rx_ops, &msg_head, ifindex)) ret = MHSIZ; else ret = -EINVAL; break; case TX_READ: /* reuse msg_head for the reply to TX_READ */ msg_head.opcode = TX_STATUS; ret = bcm_read_op(&bo->tx_ops, &msg_head, ifindex); break; case RX_READ: /* reuse msg_head for the reply to RX_READ */ msg_head.opcode = RX_STATUS; ret = bcm_read_op(&bo->rx_ops, &msg_head, ifindex); break; case TX_SEND: /* we need exactly one CAN frame behind the msg head */ if ((msg_head.nframes != 1) || (size != cfsiz + MHSIZ)) ret = -EINVAL; else ret = bcm_tx_send(msg, ifindex, sk, cfsiz); break; default: ret = -EINVAL; break; } release_sock(sk); return ret; } /* * notification handler for netdevice status changes */ static void bcm_notify(struct bcm_sock *bo, unsigned long msg, struct net_device *dev) { struct sock *sk = &bo->sk; struct bcm_op *op; int notify_enodev = 0; if (!net_eq(dev_net(dev), sock_net(sk))) return; switch (msg) { case NETDEV_UNREGISTER: lock_sock(sk); /* remove device specific receive entries */ list_for_each_entry(op, &bo->rx_ops, list) if (op->rx_reg_dev == dev) bcm_rx_unreg(dev, op); /* remove device reference, if this is our bound device */ if (bo->bound && bo->ifindex == dev->ifindex) { bo->bound = 0; bo->ifindex = 0; notify_enodev = 1; } release_sock(sk); if (notify_enodev) { sk->sk_err = ENODEV; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_error_report(sk); } break; case NETDEV_DOWN: if (bo->bound && bo->ifindex == dev->ifindex) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_error_report(sk); } } } static int bcm_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN) return NOTIFY_DONE; if (unlikely(bcm_busy_notifier)) /* Check for reentrant bug. */ return NOTIFY_DONE; spin_lock(&bcm_notifier_lock); list_for_each_entry(bcm_busy_notifier, &bcm_notifier_list, notifier) { spin_unlock(&bcm_notifier_lock); bcm_notify(bcm_busy_notifier, msg, dev); spin_lock(&bcm_notifier_lock); } bcm_busy_notifier = NULL; spin_unlock(&bcm_notifier_lock); return NOTIFY_DONE; } /* * initial settings for all BCM sockets to be set at socket creation time */ static int bcm_init(struct sock *sk) { struct bcm_sock *bo = bcm_sk(sk); bo->bound = 0; bo->ifindex = 0; bo->dropped_usr_msgs = 0; bo->bcm_proc_read = NULL; INIT_LIST_HEAD(&bo->tx_ops); INIT_LIST_HEAD(&bo->rx_ops); /* set notifier */ spin_lock(&bcm_notifier_lock); list_add_tail(&bo->notifier, &bcm_notifier_list); spin_unlock(&bcm_notifier_lock); return 0; } /* * standard socket functions */ static int bcm_release(struct socket *sock) { struct sock *sk = sock->sk; struct net *net; struct bcm_sock *bo; struct bcm_op *op, *next; if (!sk) return 0; net = sock_net(sk); bo = bcm_sk(sk); /* remove bcm_ops, timer, rx_unregister(), etc. */ spin_lock(&bcm_notifier_lock); while (bcm_busy_notifier == bo) { spin_unlock(&bcm_notifier_lock); schedule_timeout_uninterruptible(1); spin_lock(&bcm_notifier_lock); } list_del(&bo->notifier); spin_unlock(&bcm_notifier_lock); lock_sock(sk); list_for_each_entry_safe(op, next, &bo->tx_ops, list) bcm_remove_op(op); list_for_each_entry_safe(op, next, &bo->rx_ops, list) { /* * Don't care if we're bound or not (due to netdev problems) * can_rx_unregister() is always a save thing to do here. */ if (op->ifindex) { /* * Only remove subscriptions that had not * been removed due to NETDEV_UNREGISTER * in bcm_notifier() */ if (op->rx_reg_dev) { struct net_device *dev; dev = dev_get_by_index(net, op->ifindex); if (dev) { bcm_rx_unreg(dev, op); dev_put(dev); } } } else can_rx_unregister(net, NULL, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op); } synchronize_rcu(); list_for_each_entry_safe(op, next, &bo->rx_ops, list) bcm_remove_op(op); #if IS_ENABLED(CONFIG_PROC_FS) /* remove procfs entry */ if (net->can.bcmproc_dir && bo->bcm_proc_read) remove_proc_entry(bo->procname, net->can.bcmproc_dir); #endif /* CONFIG_PROC_FS */ /* remove device reference */ if (bo->bound) { bo->bound = 0; bo->ifindex = 0; } sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_put(sk); return 0; } static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len, int flags) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct bcm_sock *bo = bcm_sk(sk); struct net *net = sock_net(sk); int ret = 0; if (len < sizeof(*addr)) return -EINVAL; lock_sock(sk); if (bo->bound) { ret = -EISCONN; goto fail; } /* bind a device to this socket */ if (addr->can_ifindex) { struct net_device *dev; dev = dev_get_by_index(net, addr->can_ifindex); if (!dev) { ret = -ENODEV; goto fail; } if (dev->type != ARPHRD_CAN) { dev_put(dev); ret = -ENODEV; goto fail; } bo->ifindex = dev->ifindex; dev_put(dev); } else { /* no interface reference for ifindex = 0 ('any' CAN device) */ bo->ifindex = 0; } #if IS_ENABLED(CONFIG_PROC_FS) if (net->can.bcmproc_dir) { /* unique socket address as filename */ sprintf(bo->procname, "%lu", sock_i_ino(sk)); bo->bcm_proc_read = proc_create_net_single(bo->procname, 0644, net->can.bcmproc_dir, bcm_proc_show, sk); if (!bo->bcm_proc_read) { ret = -ENOMEM; goto fail; } } #endif /* CONFIG_PROC_FS */ bo->bound = 1; fail: release_sock(sk); return ret; } static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int error = 0; int noblock; int err; noblock = flags & MSG_DONTWAIT; flags &= ~MSG_DONTWAIT; skb = skb_recv_datagram(sk, flags, noblock, &error); if (!skb) return error; if (skb->len < size) size = skb->len; err = memcpy_to_msg(msg, skb->data, size); if (err < 0) { skb_free_datagram(sk, skb); return err; } sock_recv_ts_and_drops(msg, sk, skb); if (msg->msg_name) { __sockaddr_check_size(sizeof(struct sockaddr_can)); msg->msg_namelen = sizeof(struct sockaddr_can); memcpy(msg->msg_name, skb->cb, msg->msg_namelen); } skb_free_datagram(sk, skb); return size; } static const struct proto_ops bcm_ops = { .family = PF_CAN, .release = bcm_release, .bind = sock_no_bind, .connect = bcm_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, .getsockopt = sock_no_getsockopt, .sendmsg = bcm_sendmsg, .recvmsg = bcm_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, }; static struct proto bcm_proto __read_mostly = { .name = "CAN_BCM", .owner = THIS_MODULE, .obj_size = sizeof(struct bcm_sock), .init = bcm_init, }; static const struct can_proto bcm_can_proto = { .type = SOCK_DGRAM, .protocol = CAN_BCM, .ops = &bcm_ops, .prot = &bcm_proto, }; static int canbcm_pernet_init(struct net *net) { #if IS_ENABLED(CONFIG_PROC_FS) /* create /proc/net/can-bcm directory */ net->can.bcmproc_dir = proc_net_mkdir(net, "can-bcm", net->proc_net); #endif /* CONFIG_PROC_FS */ return 0; } static void canbcm_pernet_exit(struct net *net) { #if IS_ENABLED(CONFIG_PROC_FS) /* remove /proc/net/can-bcm directory */ if (net->can.bcmproc_dir) remove_proc_entry("can-bcm", net->proc_net); #endif /* CONFIG_PROC_FS */ } static struct pernet_operations canbcm_pernet_ops __read_mostly = { .init = canbcm_pernet_init, .exit = canbcm_pernet_exit, }; static struct notifier_block canbcm_notifier = { .notifier_call = bcm_notifier }; static int __init bcm_module_init(void) { int err; pr_info("can: broadcast manager protocol (rev " CAN_BCM_VERSION " t)\n"); err = can_proto_register(&bcm_can_proto); if (err < 0) { printk(KERN_ERR "can: registration of bcm protocol failed\n"); return err; } register_pernet_subsys(&canbcm_pernet_ops); register_netdevice_notifier(&canbcm_notifier); return 0; } static void __exit bcm_module_exit(void) { can_proto_unregister(&bcm_can_proto); unregister_netdevice_notifier(&canbcm_notifier); unregister_pernet_subsys(&canbcm_pernet_ops); } module_init(bcm_module_init); module_exit(bcm_module_exit); |
4294 4213 4214 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 | // SPDX-License-Identifier: GPL-2.0 /* bounce buffer handling for block devices * * - Split from highmem.c */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/export.h> #include <linux/swap.h> #include <linux/gfp.h> #include <linux/bio.h> #include <linux/pagemap.h> #include <linux/mempool.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> #include <linux/bootmem.h> #include <linux/printk.h> #include <asm/tlbflush.h> #include <trace/events/block.h> #include "blk.h" #define POOL_SIZE 64 #define ISA_POOL_SIZE 16 static struct bio_set bounce_bio_set, bounce_bio_split; static mempool_t page_pool, isa_page_pool; static void init_bounce_bioset(void) { static bool bounce_bs_setup; int ret; if (bounce_bs_setup) return; ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); BUG_ON(ret); if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE)) BUG_ON(1); ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0); BUG_ON(ret); bounce_bs_setup = true; } #if defined(CONFIG_HIGHMEM) static __init int init_emergency_pool(void) { int ret; #if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) if (max_pfn <= max_low_pfn) return 0; #endif ret = mempool_init_page_pool(&page_pool, POOL_SIZE, 0); BUG_ON(ret); pr_info("pool size: %d pages\n", POOL_SIZE); init_bounce_bioset(); return 0; } __initcall(init_emergency_pool); #endif #ifdef CONFIG_HIGHMEM /* * highmem version, map in to vec */ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) { unsigned char *vto; vto = kmap_atomic(to->bv_page); memcpy(vto + to->bv_offset, vfrom, to->bv_len); kunmap_atomic(vto); } #else /* CONFIG_HIGHMEM */ #define bounce_copy_vec(to, vfrom) \ memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) #endif /* CONFIG_HIGHMEM */ /* * allocate pages in the DMA region for the ISA pool */ static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) { return mempool_alloc_pages(gfp_mask | GFP_DMA, data); } static DEFINE_MUTEX(isa_mutex); /* * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA * as the max address, so check if the pool has already been created. */ int init_emergency_isa_pool(void) { int ret; mutex_lock(&isa_mutex); if (mempool_initialized(&isa_page_pool)) { mutex_unlock(&isa_mutex); return 0; } ret = mempool_init(&isa_page_pool, ISA_POOL_SIZE, mempool_alloc_pages_isa, mempool_free_pages, (void *) 0); BUG_ON(ret); pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE); init_bounce_bioset(); mutex_unlock(&isa_mutex); return 0; } /* * Simple bounce buffer support for highmem pages. Depending on the * queue gfp mask set, *to may or may not be a highmem page. kmap it * always, it will do the Right Thing */ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) { unsigned char *vfrom; struct bio_vec tovec, fromvec; struct bvec_iter iter; /* * The bio of @from is created by bounce, so we can iterate * its bvec from start to end, but the @from->bi_iter can't be * trusted because it might be changed by splitting. */ struct bvec_iter from_iter = BVEC_ITER_ALL_INIT; bio_for_each_segment(tovec, to, iter) { fromvec = bio_iter_iovec(from, from_iter); if (tovec.bv_page != fromvec.bv_page) { /* * fromvec->bv_offset and fromvec->bv_len might have * been modified by the block layer, so use the original * copy, bounce_copy_vec already uses tovec->bv_len */ vfrom = page_address(fromvec.bv_page) + tovec.bv_offset; bounce_copy_vec(&tovec, vfrom); flush_dcache_page(tovec.bv_page); } bio_advance_iter(from, &from_iter, tovec.bv_len); } } static void bounce_end_io(struct bio *bio, mempool_t *pool) { struct bio *bio_orig = bio->bi_private; struct bio_vec *bvec, orig_vec; int i; struct bvec_iter orig_iter = bio_orig->bi_iter; /* * free up bounce indirect pages used */ bio_for_each_segment_all(bvec, bio, i) { orig_vec = bio_iter_iovec(bio_orig, orig_iter); if (bvec->bv_page != orig_vec.bv_page) { dec_zone_page_state(bvec->bv_page, NR_BOUNCE); mempool_free(bvec->bv_page, pool); } bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len); } bio_orig->bi_status = bio->bi_status; bio_endio(bio_orig); bio_put(bio); } static void bounce_end_io_write(struct bio *bio) { bounce_end_io(bio, &page_pool); } static void bounce_end_io_write_isa(struct bio *bio) { bounce_end_io(bio, &isa_page_pool); } static void __bounce_end_io_read(struct bio *bio, mempool_t *pool) { struct bio *bio_orig = bio->bi_private; if (!bio->bi_status) copy_to_high_bio_irq(bio_orig, bio); bounce_end_io(bio, pool); } static void bounce_end_io_read(struct bio *bio) { __bounce_end_io_read(bio, &page_pool); } static void bounce_end_io_read_isa(struct bio *bio) { __bounce_end_io_read(bio, &isa_page_pool); } static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, struct bio_set *bs) { struct bvec_iter iter; struct bio_vec bv; struct bio *bio; /* * Pre immutable biovecs, __bio_clone() used to just do a memcpy from * bio_src->bi_io_vec to bio->bi_io_vec. * * We can't do that anymore, because: * * - The point of cloning the biovec is to produce a bio with a biovec * the caller can modify: bi_idx and bi_bvec_done should be 0. * * - The original bio could've had more than BIO_MAX_PAGES biovecs; if * we tried to clone the whole thing bio_alloc_bioset() would fail. * But the clone should succeed as long as the number of biovecs we * actually need to allocate is fewer than BIO_MAX_PAGES. * * - Lastly, bi_vcnt should not be looked at or relied upon by code * that does not own the bio - reason being drivers don't use it for * iterating over the biovec anymore, so expecting it to be kept up * to date (i.e. for clones that share the parent biovec) is just * asking for trouble and would force extra work on * __bio_clone_fast() anyways. */ bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); if (!bio) return NULL; bio->bi_disk = bio_src->bi_disk; bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: case REQ_OP_WRITE_ZEROES: break; case REQ_OP_WRITE_SAME: bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; break; default: bio_for_each_segment(bv, bio_src, iter) bio->bi_io_vec[bio->bi_vcnt++] = bv; break; } if (bio_integrity(bio_src)) { int ret; ret = bio_integrity_clone(bio, bio_src, gfp_mask); if (ret < 0) { bio_put(bio); return NULL; } } bio_clone_blkcg_association(bio, bio_src); return bio; } static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, mempool_t *pool) { struct bio *bio; int rw = bio_data_dir(*bio_orig); struct bio_vec *to, from; struct bvec_iter iter; unsigned i = 0; bool bounce = false; int sectors = 0; bool passthrough = bio_is_passthrough(*bio_orig); bio_for_each_segment(from, *bio_orig, iter) { if (i++ < BIO_MAX_PAGES) sectors += from.bv_len >> 9; if (page_to_pfn(from.bv_page) > q->limits.bounce_pfn) bounce = true; } if (!bounce) return; if (!passthrough && sectors < bio_sectors(*bio_orig)) { bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split); bio_chain(bio, *bio_orig); generic_make_request(*bio_orig); *bio_orig = bio; } bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL : &bounce_bio_set); bio_for_each_segment_all(to, bio, i) { struct page *page = to->bv_page; if (page_to_pfn(page) <= q->limits.bounce_pfn) continue; to->bv_page = mempool_alloc(pool, q->bounce_gfp); inc_zone_page_state(to->bv_page, NR_BOUNCE); if (rw == WRITE) { char *vto, *vfrom; flush_dcache_page(page); vto = page_address(to->bv_page) + to->bv_offset; vfrom = kmap_atomic(page) + to->bv_offset; memcpy(vto, vfrom, to->bv_len); kunmap_atomic(vfrom); } } trace_block_bio_bounce(q, *bio_orig); bio->bi_flags |= (1 << BIO_BOUNCED); if (pool == &page_pool) { bio->bi_end_io = bounce_end_io_write; if (rw == READ) bio->bi_end_io = bounce_end_io_read; } else { bio->bi_end_io = bounce_end_io_write_isa; if (rw == READ) bio->bi_end_io = bounce_end_io_read_isa; } bio->bi_private = *bio_orig; *bio_orig = bio; } void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) { mempool_t *pool; /* * Data-less bio, nothing to bounce */ if (!bio_has_data(*bio_orig)) return; /* * for non-isa bounce case, just check if the bounce pfn is equal * to or bigger than the highest pfn in the system -- in that case, * don't waste time iterating over bio segments */ if (!(q->bounce_gfp & GFP_DMA)) { if (q->limits.bounce_pfn >= blk_max_pfn) return; pool = &page_pool; } else { BUG_ON(!mempool_initialized(&isa_page_pool)); pool = &isa_page_pool; } /* * slow path */ __blk_queue_bounce(q, bio_orig, pool); } |
203 203 203 203 28 26 28 28 28 28 28 110 110 110 109 110 110 186 187 103 6 30 187 167 166 187 52 49 50 1 50 51 27 27 110 110 110 132 132 110 110 132 132 113 7 7 110 184 208 51 31 1 41 11 38 38 157 157 157 12 12 12 12 42 25 40 40 40 41 41 25 38 38 38 38 36 38 38 38 38 38 38 41 41 40 40 40 40 40 40 40 8 39 38 38 38 38 38 38 97 97 97 97 107 99 107 107 107 1 1 1 1 1 1 1 1 1 1 1 38 29 29 29 29 29 29 29 29 29 29 26 25 24 24 24 24 24 24 187 187 187 187 187 187 187 187 133 133 133 133 97 133 133 133 133 36 97 97 6 54 55 1 54 55 55 1 1 1 37 29 29 26 1 1 26 26 26 26 26 24 24 24 24 24 24 23 23 23 23 23 23 23 23 23 23 23 23 23 22 18 17 1 17 17 17 16 16 16 16 16 1 1 43 43 43 43 43 58 134 134 134 105 105 104 105 105 105 105 104 105 105 104 105 105 105 106 106 104 106 100 106 106 104 104 104 104 61 61 61 61 61 61 107 106 105 105 105 105 105 105 105 23 183 180 180 184 183 61 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/radix-tree.h> #include <linux/writeback.h> #include <linux/buffer_head.h> #include <linux/workqueue.h> #include <linux/kthread.h> #include <linux/slab.h> #include <linux/migrate.h> #include <linux/ratelimit.h> #include <linux/uuid.h> #include <linux/semaphore.h> #include <linux/error-injection.h> #include <linux/crc32c.h> #include <linux/sched/mm.h> #include <asm/unaligned.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" #include "volumes.h" #include "print-tree.h" #include "locking.h" #include "tree-log.h" #include "free-space-cache.h" #include "free-space-tree.h" #include "inode-map.h" #include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" #include "raid56.h" #include "sysfs.h" #include "qgroup.h" #include "compression.h" #include "tree-checker.h" #include "ref-verify.h" #ifdef CONFIG_X86 #include <asm/cpufeature.h> #endif #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ BTRFS_SUPER_FLAG_ERROR |\ BTRFS_SUPER_FLAG_SEEDING |\ BTRFS_SUPER_FLAG_METADUMP |\ BTRFS_SUPER_FLAG_METADUMP_V2) static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark); static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, struct extent_io_tree *pinned_extents); static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info); static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info); /* * btrfs_end_io_wq structs are used to do processing in task context when an IO * is complete. This is used during reads to verify checksums, and it is used * by writes to insert metadata for new file extents after IO is complete. */ struct btrfs_end_io_wq { struct bio *bio; bio_end_io_t *end_io; void *private; struct btrfs_fs_info *info; blk_status_t status; enum btrfs_wq_endio_type metadata; struct btrfs_work work; }; static struct kmem_cache *btrfs_end_io_wq_cache; int __init btrfs_end_io_wq_init(void) { btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq", sizeof(struct btrfs_end_io_wq), 0, SLAB_MEM_SPREAD, NULL); if (!btrfs_end_io_wq_cache) return -ENOMEM; return 0; } void __cold btrfs_end_io_wq_exit(void) { kmem_cache_destroy(btrfs_end_io_wq_cache); } /* * async submit bios are used to offload expensive checksumming * onto the worker threads. They checksum file and metadata bios * just before they are sent down the IO stack. */ struct async_submit_bio { void *private_data; struct bio *bio; extent_submit_bio_start_t *submit_bio_start; int mirror_num; /* * bio_offset is optional, can be used if the pages in the bio * can't tell us where in the file the bio should go */ u64 bio_offset; struct btrfs_work work; blk_status_t status; }; /* * Lockdep class keys for extent_buffer->lock's in this root. For a given * eb, the lockdep key is determined by the btrfs_root it belongs to and * the level the eb occupies in the tree. * * Different roots are used for different purposes and may nest inside each * other and they require separate keysets. As lockdep keys should be * static, assign keysets according to the purpose of the root as indicated * by btrfs_root->objectid. This ensures that all special purpose roots * have separate keysets. * * Lock-nesting across peer nodes is always done with the immediate parent * node locked thus preventing deadlock. As lockdep doesn't know this, use * subclass to avoid triggering lockdep warning in such cases. * * The key is set by the readpage_end_io_hook after the buffer has passed * csum validation but before the pages are unlocked. It is also set by * btrfs_init_new_buffer on freshly allocated blocks. * * We also add a check to make sure the highest level of the tree is the * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code * needs update as well. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC # if BTRFS_MAX_LEVEL != 8 # error # endif static struct btrfs_lockdep_keyset { u64 id; /* root objectid */ const char *name_stem; /* lock name stem */ char names[BTRFS_MAX_LEVEL + 1][20]; struct lock_class_key keys[BTRFS_MAX_LEVEL + 1]; } btrfs_lockdep_keysets[] = { { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" }, { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" }, { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" }, { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" }, { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" }, { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" }, { .id = BTRFS_QUOTA_TREE_OBJECTID, .name_stem = "quota" }, { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" }, { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" }, { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" }, { .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" }, { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, .name_stem = "free-space" }, { .id = 0, .name_stem = "tree" }, }; void __init btrfs_init_lockdep(void) { int i, j; /* initialize lockdep class names */ for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) { struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i]; for (j = 0; j < ARRAY_SIZE(ks->names); j++) snprintf(ks->names[j], sizeof(ks->names[j]), "btrfs-%s-%02d", ks->name_stem, j); } } void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level) { struct btrfs_lockdep_keyset *ks; BUG_ON(level >= ARRAY_SIZE(ks->keys)); /* find the matching keyset, id 0 is the default entry */ for (ks = btrfs_lockdep_keysets; ks->id; ks++) if (ks->id == objectid) break; lockdep_set_class_and_name(&eb->lock, &ks->keys[level], ks->names[level]); } #endif /* * extents on the btree inode are pretty simple, there's one extent * that covers the entire device */ struct extent_map *btree_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; int ret; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (em) { em->bdev = fs_info->fs_devices->latest_bdev; read_unlock(&em_tree->lock); goto out; } read_unlock(&em_tree->lock); em = alloc_extent_map(); if (!em) { em = ERR_PTR(-ENOMEM); goto out; } em->start = 0; em->len = (u64)-1; em->block_len = (u64)-1; em->block_start = 0; em->bdev = fs_info->fs_devices->latest_bdev; write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); if (ret == -EEXIST) { free_extent_map(em); em = lookup_extent_mapping(em_tree, start, len); if (!em) em = ERR_PTR(-EIO); } else if (ret) { free_extent_map(em); em = ERR_PTR(ret); } write_unlock(&em_tree->lock); out: return em; } u32 btrfs_csum_data(const char *data, u32 seed, size_t len) { return crc32c(seed, data, len); } void btrfs_csum_final(u32 crc, u8 *result) { put_unaligned_le32(~crc, result); } /* * compute the csum for a btree block, and either verify it or write it * into the csum field of the block. */ static int csum_tree_block(struct btrfs_fs_info *fs_info, struct extent_buffer *buf, int verify) { u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); char result[BTRFS_CSUM_SIZE]; unsigned long len; unsigned long cur_len; unsigned long offset = BTRFS_CSUM_SIZE; char *kaddr; unsigned long map_start; unsigned long map_len; int err; u32 crc = ~(u32)0; len = buf->len - offset; while (len > 0) { err = map_private_extent_buffer(buf, offset, 32, &kaddr, &map_start, &map_len); if (err) return err; cur_len = min(len, map_len - (offset - map_start)); crc = btrfs_csum_data(kaddr + offset - map_start, crc, cur_len); len -= cur_len; offset += cur_len; } memset(result, 0, BTRFS_CSUM_SIZE); btrfs_csum_final(crc, result); if (verify) { if (memcmp_extent_buffer(buf, result, 0, csum_size)) { u32 val; u32 found = 0; memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); btrfs_warn_rl(fs_info, "%s checksum verify failed on %llu wanted %X found %X level %d", fs_info->sb->s_id, buf->start, val, found, btrfs_header_level(buf)); return -EUCLEAN; } } else { write_extent_buffer(buf, result, 0, csum_size); } return 0; } /* * we can't consider a given block up to date unless the transid of the * block matches the transid in the parent node's pointer. This is how we * detect blocks that either didn't get written at all or got written * in the wrong place. */ static int verify_parent_transid(struct extent_io_tree *io_tree, struct extent_buffer *eb, u64 parent_transid, int atomic) { struct extent_state *cached_state = NULL; int ret; bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB); if (!parent_transid || btrfs_header_generation(eb) == parent_transid) return 0; if (atomic) return -EAGAIN; if (need_lock) { btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); } lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); if (extent_buffer_uptodate(eb) && btrfs_header_generation(eb) == parent_transid) { ret = 0; goto out; } btrfs_err_rl(eb->fs_info, "parent transid verify failed on %llu wanted %llu found %llu", eb->start, parent_transid, btrfs_header_generation(eb)); ret = 1; /* * Things reading via commit roots that don't have normal protection, * like send, can have a really old block in cache that may point at a * block that has been freed and re-allocated. So don't clear uptodate * if we find an eb that is under IO (dirty/writeback) because we could * end up reading in the stale data and then writing it back out and * making everybody very sad. */ if (!extent_buffer_under_io(eb)) clear_extent_buffer_uptodate(eb); out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); if (need_lock) btrfs_tree_read_unlock_blocking(eb); return ret; } /* * Return 0 if the superblock checksum type matches the checksum value of that * algorithm. Pass the raw disk superblock data. */ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, char *raw_disk_sb) { struct btrfs_super_block *disk_sb = (struct btrfs_super_block *)raw_disk_sb; u16 csum_type = btrfs_super_csum_type(disk_sb); int ret = 0; if (csum_type == BTRFS_CSUM_TYPE_CRC32) { u32 crc = ~(u32)0; char result[sizeof(crc)]; /* * The super_block structure does not span the whole * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space * is filled with zeros and is included in the checksum. */ crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE, crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); btrfs_csum_final(crc, result); if (memcmp(raw_disk_sb, result, sizeof(result))) ret = 1; } if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { btrfs_err(fs_info, "unsupported checksum algorithm %u", csum_type); ret = 1; } return ret; } int btrfs_verify_level_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int level, struct btrfs_key *first_key, u64 parent_transid) { int found_level; struct btrfs_key found_key; int ret; found_level = btrfs_header_level(eb); if (found_level != level) { #ifdef CONFIG_BTRFS_DEBUG WARN_ON(1); btrfs_err(fs_info, "tree level mismatch detected, bytenr=%llu level expected=%u has=%u", eb->start, level, found_level); #endif return -EIO; } if (!first_key) return 0; /* * For live tree block (new tree blocks in current transaction), * we need proper lock context to avoid race, which is impossible here. * So we only checks tree blocks which is read from disk, whose * generation <= fs_info->last_trans_committed. */ if (btrfs_header_generation(eb) > fs_info->last_trans_committed) return 0; /* We have @first_key, so this @eb must have at least one item */ if (btrfs_header_nritems(eb) == 0) { btrfs_err(fs_info, "invalid tree nritems, bytenr=%llu nritems=0 expect >0", eb->start); WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); return -EUCLEAN; } if (found_level) btrfs_node_key_to_cpu(eb, &found_key, 0); else btrfs_item_key_to_cpu(eb, &found_key, 0); ret = btrfs_comp_cpu_keys(first_key, &found_key); #ifdef CONFIG_BTRFS_DEBUG if (ret) { WARN_ON(1); btrfs_err(fs_info, "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", eb->start, parent_transid, first_key->objectid, first_key->type, first_key->offset, found_key.objectid, found_key.type, found_key.offset); } #endif return ret; } /* * helper to read a given tree block, doing retries as required when * the checksums don't match and we have alternate mirrors to try. * * @parent_transid: expected transid, skip check if 0 * @level: expected level, mandatory check * @first_key: expected key of first slot, skip check if NULL */ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, u64 parent_transid, int level, struct btrfs_key *first_key) { struct extent_io_tree *io_tree; int failed = 0; int ret; int num_copies = 0; int mirror_num = 0; int failed_mirror = 0; io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; while (1) { clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE, mirror_num); if (!ret) { if (verify_parent_transid(io_tree, eb, parent_transid, 0)) ret = -EIO; else if (btrfs_verify_level_key(fs_info, eb, level, first_key, parent_transid)) ret = -EUCLEAN; else break; } num_copies = btrfs_num_copies(fs_info, eb->start, eb->len); if (num_copies == 1) break; if (!failed_mirror) { failed = 1; failed_mirror = eb->read_mirror; } mirror_num++; if (mirror_num == failed_mirror) mirror_num++; if (mirror_num > num_copies) break; } if (failed && !ret && failed_mirror) repair_eb_io_failure(fs_info, eb, failed_mirror); return ret; } /* * checksum a dirty tree block before IO. This has extra checks to make sure * we only fill in the checksum field in the first page of a multi-page block */ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) { u64 start = page_offset(page); u64 found_start; struct extent_buffer *eb; eb = (struct extent_buffer *)page->private; if (page != eb->pages[0]) return 0; found_start = btrfs_header_bytenr(eb); /* * Please do not consolidate these warnings into a single if. * It is useful to know what went wrong. */ if (WARN_ON(found_start != start)) return -EUCLEAN; if (WARN_ON(!PageUptodate(page))) return -EUCLEAN; ASSERT(memcmp_extent_buffer(eb, fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0); return csum_tree_block(fs_info, eb, 0); } static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u8 fsid[BTRFS_FSID_SIZE]; int ret = 1; read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); while (fs_devices) { if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { ret = 0; break; } fs_devices = fs_devices->seed; } return ret; } static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror) { u64 found_start; int found_level; struct extent_buffer *eb; struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int reads_done; if (!page->private) goto out; eb = (struct extent_buffer *)page->private; /* the pending IO might have been the only thing that kept this buffer * in memory. Make sure we have a ref for all this other checks */ extent_buffer_get(eb); reads_done = atomic_dec_and_test(&eb->io_pages); if (!reads_done) goto err; eb->read_mirror = mirror; if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { ret = -EIO; goto err; } found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu", eb->start, found_start); ret = -EIO; goto err; } if (check_tree_block_fsid(fs_info, eb)) { btrfs_err_rl(fs_info, "bad fsid on block %llu", eb->start); ret = -EIO; goto err; } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "bad tree block level %d on %llu", (int)btrfs_header_level(eb), eb->start); ret = -EIO; goto err; } btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, found_level); ret = csum_tree_block(fs_info, eb, 1); if (ret) goto err; /* * If this is a leaf block and it is corrupt, set the corrupt bit so * that we don't try and read the other copies of this block, just * return -EIO. */ if (found_level == 0 && btrfs_check_leaf_full(fs_info, eb)) { set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; } if (found_level > 0 && btrfs_check_node(fs_info, eb)) ret = -EIO; if (!ret) set_extent_buffer_uptodate(eb); err: if (reads_done && test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) btree_readahead_hook(eb, ret); if (ret) { /* * our io error hook is going to dec the io pages * again, we have to make sure it has something * to decrement */ atomic_inc(&eb->io_pages); clear_extent_buffer_uptodate(eb); } free_extent_buffer(eb); out: return ret; } static int btree_io_failed_hook(struct page *page, int failed_mirror) { struct extent_buffer *eb; eb = (struct extent_buffer *)page->private; set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = failed_mirror; atomic_dec(&eb->io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) btree_readahead_hook(eb, -EIO); return -EIO; /* we fixed nothing */ } static void end_workqueue_bio(struct bio *bio) { struct btrfs_end_io_wq *end_io_wq = bio->bi_private; struct btrfs_fs_info *fs_info; struct btrfs_workqueue *wq; btrfs_work_func_t func; fs_info = end_io_wq->info; end_io_wq->status = bio->bi_status; if (bio_op(bio) == REQ_OP_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { wq = fs_info->endio_meta_write_workers; func = btrfs_endio_meta_write_helper; } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) { wq = fs_info->endio_freespace_worker; func = btrfs_freespace_write_helper; } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { wq = fs_info->endio_raid56_workers; func = btrfs_endio_raid56_helper; } else { wq = fs_info->endio_write_workers; func = btrfs_endio_write_helper; } } else { if (unlikely(end_io_wq->metadata == BTRFS_WQ_ENDIO_DIO_REPAIR)) { wq = fs_info->endio_repair_workers; func = btrfs_endio_repair_helper; } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { wq = fs_info->endio_raid56_workers; func = btrfs_endio_raid56_helper; } else if (end_io_wq->metadata) { wq = fs_info->endio_meta_workers; func = btrfs_endio_meta_helper; } else { wq = fs_info->endio_workers; func = btrfs_endio_helper; } } btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL); btrfs_queue_work(wq, &end_io_wq->work); } blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata) { struct btrfs_end_io_wq *end_io_wq; end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS); if (!end_io_wq) return BLK_STS_RESOURCE; end_io_wq->private = bio->bi_private; end_io_wq->end_io = bio->bi_end_io; end_io_wq->info = info; end_io_wq->status = 0; end_io_wq->bio = bio; end_io_wq->metadata = metadata; bio->bi_private = end_io_wq; bio->bi_end_io = end_workqueue_bio; return 0; } static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; blk_status_t ret; async = container_of(work, struct async_submit_bio, work); ret = async->submit_bio_start(async->private_data, async->bio, async->bio_offset); if (ret) async->status = ret; } static void run_one_async_done(struct btrfs_work *work) { struct async_submit_bio *async; async = container_of(work, struct async_submit_bio, work); /* If an error occurred we just want to clean up the bio and move on */ if (async->status) { async->bio->bi_status = async->status; bio_endio(async->bio); return; } btrfs_submit_bio_done(async->private_data, async->bio, async->mirror_num); } static void run_one_async_free(struct btrfs_work *work) { struct async_submit_bio *async; async = container_of(work, struct async_submit_bio, work); kfree(async); } blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, void *private_data, extent_submit_bio_start_t *submit_bio_start) { struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) return BLK_STS_RESOURCE; async->private_data = private_data; async->bio = bio; async->mirror_num = mirror_num; async->submit_bio_start = submit_bio_start; btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start, run_one_async_done, run_one_async_free); async->bio_offset = bio_offset; async->status = 0; if (op_is_sync(bio->bi_opf)) btrfs_set_work_high_priority(&async->work); btrfs_queue_work(fs_info->workers, &async->work); return 0; } static blk_status_t btree_csum_one_bio(struct bio *bio) { struct bio_vec *bvec; struct btrfs_root *root; int i, ret = 0; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, i) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; ret = csum_dirty_buffer(root->fs_info, bvec->bv_page); if (ret) break; } return errno_to_blk_status(ret); } static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio, u64 bio_offset) { /* * when we're called for a write, we're already in the async * submission context. Just jump into btrfs_map_bio */ return btree_csum_one_bio(bio); } static int check_async_write(struct btrfs_inode *bi) { if (atomic_read(&bi->sync_writers)) return 0; #ifdef CONFIG_X86 if (static_cpu_has(X86_FEATURE_XMM4_2)) return 0; #endif return 1; } static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int async = check_async_write(BTRFS_I(inode)); blk_status_t ret; if (bio_op(bio) != REQ_OP_WRITE) { /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads */ ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_METADATA); if (ret) goto out_w_error; ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); } else if (!async) { ret = btree_csum_one_bio(bio); if (ret) goto out_w_error; ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); } else { /* * kthread helpers are used to submit writes so that * checksumming can happen in parallel across all CPUs */ ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0, bio_offset, private_data, btree_submit_bio_start); } if (ret) goto out_w_error; return 0; out_w_error: bio->bi_status = ret; bio_endio(bio); return ret; } #ifdef CONFIG_MIGRATION static int btree_migratepage(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) { /* * we can't safely write a btree page from here, * we haven't done the locking hook */ if (PageDirty(page)) return -EAGAIN; /* * Buffers may be managed in a filesystem specific way. * We must have no buffers or drop them. */ if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; return migrate_page(mapping, newpage, page, mode); } #endif static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info; int ret; if (wbc->sync_mode == WB_SYNC_NONE) { if (wbc->for_kupdate) return 0; fs_info = BTRFS_I(mapping->host)->root->fs_info; /* this is a bit racy, but that's ok */ ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes, BTRFS_DIRTY_METADATA_THRESH, fs_info->dirty_metadata_batch); if (ret < 0) return 0; } return btree_write_cache_pages(mapping, wbc); } static int btree_readpage(struct file *file, struct page *page) { struct extent_io_tree *tree; tree = &BTRFS_I(page->mapping->host)->io_tree; return extent_read_full_page(tree, page, btree_get_extent, 0); } static int btree_releasepage(struct page *page, gfp_t gfp_flags) { if (PageWriteback(page) || PageDirty(page)) return 0; return try_release_extent_buffer(page); } static void btree_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { struct extent_io_tree *tree; tree = &BTRFS_I(page->mapping->host)->io_tree; extent_invalidatepage(tree, page, offset); btree_releasepage(page, GFP_NOFS); if (PagePrivate(page)) { btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info, "page private not zero on page %llu", (unsigned long long)page_offset(page)); ClearPagePrivate(page); set_page_private(page, 0); put_page(page); } } static int btree_set_page_dirty(struct page *page) { #ifdef DEBUG struct extent_buffer *eb; BUG_ON(!PagePrivate(page)); eb = (struct extent_buffer *)page->private; BUG_ON(!eb); BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); BUG_ON(!atomic_read(&eb->refs)); btrfs_assert_tree_locked(eb); #endif return __set_page_dirty_nobuffers(page); } static const struct address_space_operations btree_aops = { .readpage = btree_readpage, .writepages = btree_writepages, .releasepage = btree_releasepage, .invalidatepage = btree_invalidatepage, #ifdef CONFIG_MIGRATION .migratepage = btree_migratepage, #endif .set_page_dirty = btree_set_page_dirty, }; void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr) { struct extent_buffer *buf = NULL; struct inode *btree_inode = fs_info->btree_inode; int ret; buf = btrfs_find_create_tree_block(fs_info, bytenr); if (IS_ERR(buf)) return; ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, WAIT_NONE, 0); if (ret < 0) free_extent_buffer_stale(buf); else free_extent_buffer(buf); } int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, int mirror_num, struct extent_buffer **eb) { struct extent_buffer *buf = NULL; struct inode *btree_inode = fs_info->btree_inode; struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; int ret; buf = btrfs_find_create_tree_block(fs_info, bytenr); if (IS_ERR(buf)) return 0; set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK, mirror_num); if (ret) { free_extent_buffer_stale(buf); return ret; } if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { free_extent_buffer_stale(buf); return -EIO; } else if (extent_buffer_uptodate(buf)) { *eb = buf; } else { free_extent_buffer(buf); } return 0; } struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr) { if (btrfs_is_testing(fs_info)) return alloc_test_extent_buffer(fs_info, bytenr); return alloc_extent_buffer(fs_info, bytenr); } int btrfs_write_tree_block(struct extent_buffer *buf) { return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start, buf->start + buf->len - 1); } void btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { filemap_fdatawait_range(buf->pages[0]->mapping, buf->start, buf->start + buf->len - 1); } /* * Read tree block at logical address @bytenr and do variant basic but critical * verification. * * @parent_transid: expected transid of this tree block, skip check if 0 * @level: expected level, mandatory check * @first_key: expected key in slot 0, skip check if NULL */ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 parent_transid, int level, struct btrfs_key *first_key) { struct extent_buffer *buf = NULL; int ret; buf = btrfs_find_create_tree_block(fs_info, bytenr); if (IS_ERR(buf)) return buf; ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid, level, first_key); if (ret) { free_extent_buffer_stale(buf); return ERR_PTR(ret); } return buf; } void clean_tree_block(struct btrfs_fs_info *fs_info, struct extent_buffer *buf) { if (btrfs_header_generation(buf) == fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -buf->len, fs_info->dirty_metadata_batch); /* ugh, clear_extent_buffer_dirty needs to lock the page */ btrfs_set_lock_blocking(buf); clear_extent_buffer_dirty(buf); } } } static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) { struct btrfs_subvolume_writers *writers; int ret; writers = kmalloc(sizeof(*writers), GFP_NOFS); if (!writers) return ERR_PTR(-ENOMEM); ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS); if (ret < 0) { kfree(writers); return ERR_PTR(ret); } init_waitqueue_head(&writers->wait); return writers; } static void btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers) { percpu_counter_destroy(&writers->counter); kfree(writers); } static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); root->node = NULL; root->commit_root = NULL; root->state = 0; root->orphan_cleanup_state = 0; root->objectid = objectid; root->last_trans = 0; root->highest_objectid = 0; root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; root->inode_tree = RB_ROOT; INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); root->block_rsv = NULL; INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->root_list); INIT_LIST_HEAD(&root->delalloc_inodes); INIT_LIST_HEAD(&root->delalloc_root); INIT_LIST_HEAD(&root->ordered_extents); INIT_LIST_HEAD(&root->ordered_root); INIT_LIST_HEAD(&root->logged_list[0]); INIT_LIST_HEAD(&root->logged_list[1]); spin_lock_init(&root->inode_lock); spin_lock_init(&root->delalloc_lock); spin_lock_init(&root->ordered_extent_lock); spin_lock_init(&root->accounting_lock); spin_lock_init(&root->log_extents_lock[0]); spin_lock_init(&root->log_extents_lock[1]); spin_lock_init(&root->qgroup_meta_rsv_lock); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); mutex_init(&root->ordered_extent_mutex); mutex_init(&root->delalloc_mutex); init_waitqueue_head(&root->log_writer_wait); init_waitqueue_head(&root->log_commit_wait[0]); init_waitqueue_head(&root->log_commit_wait[1]); INIT_LIST_HEAD(&root->log_ctxs[0]); INIT_LIST_HEAD(&root->log_ctxs[1]); atomic_set(&root->log_commit[0], 0); atomic_set(&root->log_commit[1], 0); atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); refcount_set(&root->refs, 1); atomic_set(&root->will_be_snapshotted, 0); atomic_set(&root->snapshot_force_cow, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; if (!dummy) extent_io_tree_init(&root->dirty_log_pages, NULL); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); if (!dummy) root->defrag_trans_start = fs_info->generation; else root->defrag_trans_start = 0; root->root_key.objectid = objectid; root->anon_dev = 0; spin_lock_init(&root->root_item_lock); } static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, gfp_t flags) { struct btrfs_root *root = kzalloc(sizeof(*root), flags); if (root) root->fs_info = fs_info; return root; } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* Should only be used by the testing infrastructure */ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; if (!fs_info) return ERR_PTR(-EINVAL); root = btrfs_alloc_root(fs_info, GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); /* We don't use the stripesize in selftest, set it as sectorsize */ __setup_root(root, fs_info, BTRFS_ROOT_TREE_OBJECTID); root->alloc_bytenr = 0; return root; } #endif struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 objectid) { struct extent_buffer *leaf; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *root; struct btrfs_key key; unsigned int nofs_flag; int ret = 0; uuid_le uuid = NULL_UUID_LE; /* * We're holding a transaction handle, so use a NOFS memory allocation * context to avoid deadlock if reclaim happens. */ nofs_flag = memalloc_nofs_save(); root = btrfs_alloc_root(fs_info, GFP_KERNEL); memalloc_nofs_restore(nofs_flag); if (!root) return ERR_PTR(-ENOMEM); __setup_root(root, fs_info, objectid); root->root_key.objectid = objectid; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; goto fail; } root->node = leaf; btrfs_mark_buffer_dirty(leaf); root->commit_root = btrfs_root_node(root); set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); root->root_item.flags = 0; root->root_item.byte_limit = 0; btrfs_set_root_bytenr(&root->root_item, leaf->start); btrfs_set_root_generation(&root->root_item, trans->transid); btrfs_set_root_level(&root->root_item, 0); btrfs_set_root_refs(&root->root_item, 1); btrfs_set_root_used(&root->root_item, leaf->len); btrfs_set_root_last_snapshot(&root->root_item, 0); btrfs_set_root_dirid(&root->root_item, 0); if (is_fstree(objectid)) uuid_le_gen(&uuid); memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE); root->root_item.drop_level = 0; key.objectid = objectid; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = 0; ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item); if (ret) goto fail; btrfs_tree_unlock(leaf); return root; fail: if (leaf) { btrfs_tree_unlock(leaf); free_extent_buffer(root->commit_root); free_extent_buffer(leaf); } kfree(root); return ERR_PTR(ret); } static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { struct btrfs_root *root; struct extent_buffer *leaf; root = btrfs_alloc_root(fs_info, GFP_NOFS); if (!root) return ERR_PTR(-ENOMEM); __setup_root(root, fs_info, BTRFS_TREE_LOG_OBJECTID); root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; /* * DON'T set REF_COWS for log trees * * log trees do not get reference counted because they go away * before a real commit is actually done. They do store pointers * to file data extents, and those reference counts still get * updated (along with back refs to the log tree). */ leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); if (IS_ERR(leaf)) { kfree(root); return ERR_CAST(leaf); } root->node = leaf; btrfs_mark_buffer_dirty(root->node); btrfs_tree_unlock(root->node); return root; } int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { struct btrfs_root *log_root; log_root = alloc_log_tree(trans, fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); WARN_ON(fs_info->log_root_tree); fs_info->log_root_tree = log_root; return 0; } int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log_root; struct btrfs_inode_item *inode_item; log_root = alloc_log_tree(trans, fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); log_root->last_trans = trans->transid; log_root->root_key.offset = root->root_key.objectid; inode_item = &log_root->root_item.inode; btrfs_set_stack_inode_generation(inode_item, 1); btrfs_set_stack_inode_size(inode_item, 3); btrfs_set_stack_inode_nlink(inode_item, 1); btrfs_set_stack_inode_nbytes(inode_item, fs_info->nodesize); btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); btrfs_set_root_node(&log_root->root_item, log_root->node); WARN_ON(root->log_root); root->log_root = log_root; root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; return 0; } static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_path *path; u64 generation; int ret; int level; path = btrfs_alloc_path(); if (!path) return ERR_PTR(-ENOMEM); root = btrfs_alloc_root(fs_info, GFP_NOFS); if (!root) { ret = -ENOMEM; goto alloc_fail; } __setup_root(root, fs_info, key->objectid); ret = btrfs_find_root(tree_root, key, path, &root->root_item, &root->root_key); if (ret) { if (ret > 0) ret = -ENOENT; goto find_fail; } generation = btrfs_root_generation(&root->root_item); level = btrfs_root_level(&root->root_item); root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item), generation, level, NULL); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); goto find_fail; } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { ret = -EIO; free_extent_buffer(root->node); goto find_fail; } root->commit_root = btrfs_root_node(root); out: btrfs_free_path(path); return root; find_fail: kfree(root); alloc_fail: root = ERR_PTR(ret); goto out; } struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, struct btrfs_key *location) { struct btrfs_root *root; root = btrfs_read_tree_root(tree_root, location); if (IS_ERR(root)) return root; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { set_bit(BTRFS_ROOT_REF_COWS, &root->state); btrfs_check_and_init_root_item(&root->root_item); } return root; } int btrfs_init_fs_root(struct btrfs_root *root) { int ret; struct btrfs_subvolume_writers *writers; root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), GFP_NOFS); if (!root->free_ino_pinned || !root->free_ino_ctl) { ret = -ENOMEM; goto fail; } writers = btrfs_alloc_subvolume_writers(); if (IS_ERR(writers)) { ret = PTR_ERR(writers); goto fail; } root->subv_writers = writers; btrfs_init_free_ino_ctl(root); spin_lock_init(&root->ino_cache_lock); init_waitqueue_head(&root->ino_cache_wait); /* * Don't assign anonymous block device to roots that are not exposed to * userspace, the id pool is limited to 1M */ if (is_fstree(root->root_key.objectid) && btrfs_root_refs(&root->root_item) > 0) { ret = get_anon_bdev(&root->anon_dev); if (ret) goto fail; } mutex_lock(&root->objectid_mutex); ret = btrfs_find_highest_objectid(root, &root->highest_objectid); if (ret) { mutex_unlock(&root->objectid_mutex); goto fail; } ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); mutex_unlock(&root->objectid_mutex); return 0; fail: /* The caller is responsible to call btrfs_free_fs_root */ return ret; } struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, u64 root_id) { struct btrfs_root *root; spin_lock(&fs_info->fs_roots_radix_lock); root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)root_id); spin_unlock(&fs_info->fs_roots_radix_lock); return root; } int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { int ret; ret = radix_tree_preload(GFP_NOFS); if (ret) return ret; spin_lock(&fs_info->fs_roots_radix_lock); ret = radix_tree_insert(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, root); if (ret == 0) set_bit(BTRFS_ROOT_IN_RADIX, &root->state); spin_unlock(&fs_info->fs_roots_radix_lock); radix_tree_preload_end(); return ret; } struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_key *location, bool check_ref) { struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; int ret; if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) return fs_info->tree_root; if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) return fs_info->extent_root; if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) return fs_info->chunk_root; if (location->objectid == BTRFS_DEV_TREE_OBJECTID) return fs_info->dev_root; if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) return fs_info->csum_root; if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID) return fs_info->quota_root ? fs_info->quota_root : ERR_PTR(-ENOENT); if (location->objectid == BTRFS_UUID_TREE_OBJECTID) return fs_info->uuid_root ? fs_info->uuid_root : ERR_PTR(-ENOENT); if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) return fs_info->free_space_root ? fs_info->free_space_root : ERR_PTR(-ENOENT); again: root = btrfs_lookup_fs_root(fs_info, location->objectid); if (root) { if (check_ref && btrfs_root_refs(&root->root_item) == 0) return ERR_PTR(-ENOENT); return root; } root = btrfs_read_fs_root(fs_info->tree_root, location); if (IS_ERR(root)) return root; if (check_ref && btrfs_root_refs(&root->root_item) == 0) { ret = -ENOENT; goto fail; } ret = btrfs_init_fs_root(root); if (ret) goto fail; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto fail; } key.objectid = BTRFS_ORPHAN_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = location->objectid; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); btrfs_free_path(path); if (ret < 0) goto fail; if (ret == 0) set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); ret = btrfs_insert_fs_root(fs_info, root); if (ret) { if (ret == -EEXIST) { btrfs_free_fs_root(root); goto again; } goto fail; } return root; fail: btrfs_free_fs_root(root); return ERR_PTR(ret); } static int btrfs_congested_fn(void *congested_data, int bdi_bits) { struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; int ret = 0; struct btrfs_device *device; struct backing_dev_info *bdi; rcu_read_lock(); list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) { if (!device->bdev) continue; bdi = device->bdev->bd_bdi; if (bdi_congested(bdi, bdi_bits)) { ret = 1; break; } } rcu_read_unlock(); return ret; } /* * called by the kthread helper functions to finally call the bio end_io * functions. This is where read checksum verification actually happens */ static void end_workqueue_fn(struct btrfs_work *work) { struct bio *bio; struct btrfs_end_io_wq *end_io_wq; end_io_wq = container_of(work, struct btrfs_end_io_wq, work); bio = end_io_wq->bio; bio->bi_status = end_io_wq->status; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; bio_endio(bio); kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); } static int cleaner_kthread(void *arg) { struct btrfs_root *root = arg; struct btrfs_fs_info *fs_info = root->fs_info; int again; while (1) { again = 0; /* Make the cleaner go to sleep early. */ if (btrfs_need_cleaner_sleep(fs_info)) goto sleep; /* * Do not do anything if we might cause open_ctree() to block * before we have finished mounting the filesystem. */ if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) goto sleep; if (!mutex_trylock(&fs_info->cleaner_mutex)) goto sleep; /* * Avoid the problem that we change the status of the fs * during the above check and trylock. */ if (btrfs_need_cleaner_sleep(fs_info)) { mutex_unlock(&fs_info->cleaner_mutex); goto sleep; } mutex_lock(&fs_info->cleaner_delayed_iput_mutex); btrfs_run_delayed_iputs(fs_info); mutex_unlock(&fs_info->cleaner_delayed_iput_mutex); again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(&fs_info->cleaner_mutex); /* * The defragger has dealt with the R/O remount and umount, * needn't do anything special here. */ btrfs_run_defrag_inodes(fs_info); /* * Acquires fs_info->delete_unused_bgs_mutex to avoid racing * with relocation (btrfs_relocate_chunk) and relocation * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group) * after acquiring fs_info->delete_unused_bgs_mutex. So we * can't hold, nor need to, fs_info->cleaner_mutex when deleting * unused block groups. */ btrfs_delete_unused_bgs(fs_info); sleep: if (kthread_should_park()) kthread_parkme(); if (kthread_should_stop()) return 0; if (!again) { set_current_state(TASK_INTERRUPTIBLE); schedule(); __set_current_state(TASK_RUNNING); } } } static int transaction_kthread(void *arg) { struct btrfs_root *root = arg; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct btrfs_transaction *cur; u64 transid; time64_t now; unsigned long delay; bool cannot_commit; do { cannot_commit = false; delay = HZ * fs_info->commit_interval; mutex_lock(&fs_info->transaction_kthread_mutex); spin_lock(&fs_info->trans_lock); cur = fs_info->running_transaction; if (!cur) { spin_unlock(&fs_info->trans_lock); goto sleep; } now = ktime_get_seconds(); if (cur->state < TRANS_STATE_BLOCKED && !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) && (now < cur->start_time || now - cur->start_time < fs_info->commit_interval)) { spin_unlock(&fs_info->trans_lock); delay = HZ * 5; goto sleep; } transid = cur->transid; spin_unlock(&fs_info->trans_lock); /* If the file system is aborted, this will always fail. */ trans = btrfs_attach_transaction(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT) cannot_commit = true; goto sleep; } if (transid == trans->transid) { btrfs_commit_transaction(trans); } else { btrfs_end_transaction(trans); } sleep: wake_up_process(fs_info->cleaner_kthread); mutex_unlock(&fs_info->transaction_kthread_mutex); if (unlikely(test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))) btrfs_cleanup_transaction(fs_info); if (!kthread_should_stop() && (!btrfs_transaction_blocked(fs_info) || cannot_commit)) schedule_timeout_interruptible(delay); } while (!kthread_should_stop()); return 0; } /* * this will find the highest generation in the array of * root backups. The index of the highest array is returned, * or -1 if we can't find anything. * * We check to make sure the array is valid by comparing the * generation of the latest root in the array with the generation * in the super block. If they don't match we pitch it. */ static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen) { u64 cur; int newest_index = -1; struct btrfs_root_backup *root_backup; int i; for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { root_backup = info->super_copy->super_roots + i; cur = btrfs_backup_tree_root_gen(root_backup); if (cur == newest_gen) newest_index = i; } /* check to see if we actually wrapped around */ if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) { root_backup = info->super_copy->super_roots; cur = btrfs_backup_tree_root_gen(root_backup); if (cur == newest_gen) newest_index = 0; } return newest_index; } /* * find the oldest backup so we know where to store new entries * in the backup array. This will set the backup_root_index * field in the fs_info struct */ static void find_oldest_super_backup(struct btrfs_fs_info *info, u64 newest_gen) { int newest_index = -1; newest_index = find_newest_super_backup(info, newest_gen); /* if there was garbage in there, just move along */ if (newest_index == -1) { info->backup_root_index = 0; } else { info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS; } } /* * copy all the root pointers into the super backup array. * this will bump the backup pointer by one when it is * done */ static void backup_super_roots(struct btrfs_fs_info *info) { int next_backup; struct btrfs_root_backup *root_backup; int last_backup; next_backup = info->backup_root_index; last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) % BTRFS_NUM_BACKUP_ROOTS; /* * just overwrite the last backup if we're at the same generation * this happens only at umount */ root_backup = info->super_for_commit->super_roots + last_backup; if (btrfs_backup_tree_root_gen(root_backup) == btrfs_header_generation(info->tree_root->node)) next_backup = last_backup; root_backup = info->super_for_commit->super_roots + next_backup; /* * make sure all of our padding and empty slots get zero filled * regardless of which ones we use today */ memset(root_backup, 0, sizeof(*root_backup)); info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS; btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start); btrfs_set_backup_tree_root_gen(root_backup, btrfs_header_generation(info->tree_root->node)); btrfs_set_backup_tree_root_level(root_backup, btrfs_header_level(info->tree_root->node)); btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start); btrfs_set_backup_chunk_root_gen(root_backup, btrfs_header_generation(info->chunk_root->node)); btrfs_set_backup_chunk_root_level(root_backup, btrfs_header_level(info->chunk_root->node)); btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start); btrfs_set_backup_extent_root_gen(root_backup, btrfs_header_generation(info->extent_root->node)); btrfs_set_backup_extent_root_level(root_backup, btrfs_header_level(info->extent_root->node)); /* * we might commit during log recovery, which happens before we set * the fs_root. Make sure it is valid before we fill it in. */ if (info->fs_root && info->fs_root->node) { btrfs_set_backup_fs_root(root_backup, info->fs_root->node->start); btrfs_set_backup_fs_root_gen(root_backup, btrfs_header_generation(info->fs_root->node)); btrfs_set_backup_fs_root_level(root_backup, btrfs_header_level(info->fs_root->node)); } btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start); btrfs_set_backup_dev_root_gen(root_backup, btrfs_header_generation(info->dev_root->node)); btrfs_set_backup_dev_root_level(root_backup, btrfs_header_level(info->dev_root->node)); btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start); btrfs_set_backup_csum_root_gen(root_backup, btrfs_header_generation(info->csum_root->node)); btrfs_set_backup_csum_root_level(root_backup, btrfs_header_level(info->csum_root->node)); btrfs_set_backup_total_bytes(root_backup, btrfs_super_total_bytes(info->super_copy)); btrfs_set_backup_bytes_used(root_backup, btrfs_super_bytes_used(info->super_copy)); btrfs_set_backup_num_devices(root_backup, btrfs_super_num_devices(info->super_copy)); /* * if we don't copy this out to the super_copy, it won't get remembered * for the next commit */ memcpy(&info->super_copy->super_roots, &info->super_for_commit->super_roots, sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS); } /* * this copies info out of the root backup array and back into * the in-memory super block. It is meant to help iterate through * the array, so you send it the number of backups you've already * tried and the last backup index you used. * * this returns -1 when it has tried all the backups */ static noinline int next_root_backup(struct btrfs_fs_info *info, struct btrfs_super_block *super, int *num_backups_tried, int *backup_index) { struct btrfs_root_backup *root_backup; int newest = *backup_index; if (*num_backups_tried == 0) { u64 gen = btrfs_super_generation(super); newest = find_newest_super_backup(info, gen); if (newest == -1) return -1; *backup_index = newest; *num_backups_tried = 1; } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) { /* we've tried all the backups, all done */ return -1; } else { /* jump to the next oldest backup */ newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) % BTRFS_NUM_BACKUP_ROOTS; *backup_index = newest; *num_backups_tried += 1; } root_backup = super->super_roots + newest; btrfs_set_super_generation(super, btrfs_backup_tree_root_gen(root_backup)); btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup)); btrfs_set_super_root_level(super, btrfs_backup_tree_root_level(root_backup)); btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup)); /* * fixme: the total bytes and num_devices need to match or we should * need a fsck */ btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup)); btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup)); return 0; } /* helper to cleanup workers */ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) { btrfs_destroy_workqueue(fs_info->fixup_workers); btrfs_destroy_workqueue(fs_info->delalloc_workers); btrfs_destroy_workqueue(fs_info->workers); btrfs_destroy_workqueue(fs_info->endio_workers); btrfs_destroy_workqueue(fs_info->endio_raid56_workers); btrfs_destroy_workqueue(fs_info->endio_repair_workers); btrfs_destroy_workqueue(fs_info->rmw_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->submit_workers); btrfs_destroy_workqueue(fs_info->delayed_workers); btrfs_destroy_workqueue(fs_info->caching_workers); btrfs_destroy_workqueue(fs_info->readahead_workers); btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); btrfs_destroy_workqueue(fs_info->extent_workers); /* * Now that all other work queues are destroyed, we can safely destroy * the queues used for metadata I/O, since tasks from those other work * queues can do metadata I/O operations. */ btrfs_destroy_workqueue(fs_info->endio_meta_workers); btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); } static void free_root_extent_buffers(struct btrfs_root *root) { if (root) { free_extent_buffer(root->node); free_extent_buffer(root->commit_root); root->node = NULL; root->commit_root = NULL; } } /* helper to cleanup tree roots */ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) { free_root_extent_buffers(info->tree_root); free_root_extent_buffers(info->dev_root); free_root_extent_buffers(info->extent_root); free_root_extent_buffers(info->csum_root); free_root_extent_buffers(info->quota_root); free_root_extent_buffers(info->uuid_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); free_root_extent_buffers(info->free_space_root); } void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) { int ret; struct btrfs_root *gang[8]; int i; while (!list_empty(&fs_info->dead_roots)) { gang[0] = list_entry(fs_info->dead_roots.next, struct btrfs_root, root_list); list_del(&gang[0]->root_list); if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) { btrfs_drop_and_free_fs_root(fs_info, gang[0]); } else { free_extent_buffer(gang[0]->node); free_extent_buffer(gang[0]->commit_root); btrfs_put_fs_root(gang[0]); } } while (1) { ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, 0, ARRAY_SIZE(gang)); if (!ret) break; for (i = 0; i < ret; i++) btrfs_drop_and_free_fs_root(fs_info, gang[i]); } if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { btrfs_free_log_root_tree(NULL, fs_info); btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents); } } static void btrfs_init_scrub(struct btrfs_fs_info *fs_info) { mutex_init(&fs_info->scrub_lock); atomic_set(&fs_info->scrubs_running, 0); atomic_set(&fs_info->scrub_pause_req, 0); atomic_set(&fs_info->scrubs_paused, 0); atomic_set(&fs_info->scrub_cancel_req, 0); init_waitqueue_head(&fs_info->scrub_pause_wait); fs_info->scrub_workers_refcnt = 0; } static void btrfs_init_balance(struct btrfs_fs_info *fs_info) { spin_lock_init(&fs_info->balance_lock); mutex_init(&fs_info->balance_mutex); atomic_set(&fs_info->balance_pause_req, 0); atomic_set(&fs_info->balance_cancel_req, 0); fs_info->balance_ctl = NULL; init_waitqueue_head(&fs_info->balance_wait_q); } static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) { struct inode *inode = fs_info->btree_inode; inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; set_nlink(inode, 1); /* * we set the i_size on the btree inode to the max possible int. * the real end of the address space is determined by all of * the devices in the system */ inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &btree_aops; RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode); BTRFS_I(inode)->io_tree.track_uptodate = 0; extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops; BTRFS_I(inode)->root = fs_info->tree_root; memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key)); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); btrfs_insert_inode_hash(inode); } static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) { mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); rwlock_init(&fs_info->dev_replace.lock); atomic_set(&fs_info->dev_replace.read_locks, 0); atomic_set(&fs_info->dev_replace.blocking_readers, 0); init_waitqueue_head(&fs_info->replace_wait); init_waitqueue_head(&fs_info->dev_replace.read_lock_wq); } static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) { spin_lock_init(&fs_info->qgroup_lock); mutex_init(&fs_info->qgroup_ioctl_lock); fs_info->qgroup_tree = RB_ROOT; fs_info->qgroup_op_tree = RB_ROOT; INIT_LIST_HEAD(&fs_info->dirty_qgroups); fs_info->qgroup_seq = 1; fs_info->qgroup_ulist = NULL; fs_info->qgroup_rescan_running = false; mutex_init(&fs_info->qgroup_rescan_lock); } static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { u32 max_active = fs_info->thread_pool_size; unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; fs_info->workers = btrfs_alloc_workqueue(fs_info, "worker", flags | WQ_HIGHPRI, max_active, 16); fs_info->delalloc_workers = btrfs_alloc_workqueue(fs_info, "delalloc", flags, max_active, 2); fs_info->flush_workers = btrfs_alloc_workqueue(fs_info, "flush_delalloc", flags, max_active, 0); fs_info->caching_workers = btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0); /* * a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the * devices */ fs_info->submit_workers = btrfs_alloc_workqueue(fs_info, "submit", flags, min_t(u64, fs_devices->num_devices, max_active), 64); fs_info->fixup_workers = btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0); /* * endios are largely parallel and should have a very * low idle thresh */ fs_info->endio_workers = btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4); fs_info->endio_meta_workers = btrfs_alloc_workqueue(fs_info, "endio-meta", flags, max_active, 4); fs_info->endio_meta_write_workers = btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags, max_active, 2); fs_info->endio_raid56_workers = btrfs_alloc_workqueue(fs_info, "endio-raid56", flags, max_active, 4); fs_info->endio_repair_workers = btrfs_alloc_workqueue(fs_info, "endio-repair", flags, 1, 0); fs_info->rmw_workers = btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2); fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, max_active, 2); fs_info->endio_freespace_worker = btrfs_alloc_workqueue(fs_info, "freespace-write", flags, max_active, 0); fs_info->delayed_workers = btrfs_alloc_workqueue(fs_info, "delayed-meta", flags, max_active, 0); fs_info->readahead_workers = btrfs_alloc_workqueue(fs_info, "readahead", flags, max_active, 2); fs_info->qgroup_rescan_workers = btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); fs_info->extent_workers = btrfs_alloc_workqueue(fs_info, "extent-refs", flags, min_t(u64, fs_devices->num_devices, max_active), 8); if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->submit_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->endio_meta_write_workers && fs_info->endio_repair_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->readahead_workers && fs_info->fixup_workers && fs_info->delayed_workers && fs_info->extent_workers && fs_info->qgroup_rescan_workers)) { return -ENOMEM; } return 0; } static int btrfs_replay_log(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { int ret; struct btrfs_root *log_tree_root; struct btrfs_super_block *disk_super = fs_info->super_copy; u64 bytenr = btrfs_super_log_root(disk_super); int level = btrfs_super_log_root_level(disk_super); if (fs_devices->rw_devices == 0) { btrfs_warn(fs_info, "log replay required on RO media"); return -EIO; } log_tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); if (!log_tree_root) return -ENOMEM; __setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); log_tree_root->node = read_tree_block(fs_info, bytenr, fs_info->generation + 1, level, NULL); if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); kfree(log_tree_root); return ret; } else if (!extent_buffer_uptodate(log_tree_root->node)) { btrfs_err(fs_info, "failed to read log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); return -EIO; } /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); if (ret) { btrfs_handle_fs_error(fs_info, ret, "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); return ret; } if (sb_rdonly(fs_info->sb)) { ret = btrfs_commit_super(fs_info); if (ret) return ret; } return 0; } static int btrfs_read_roots(struct btrfs_fs_info *fs_info) { struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *root; struct btrfs_key location; int ret; BUG_ON(!fs_info->tree_root); location.objectid = BTRFS_EXTENT_TREE_OBJECTID; location.type = BTRFS_ROOT_ITEM_KEY; location.offset = 0; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out; } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->extent_root = root; location.objectid = BTRFS_DEV_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out; } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->dev_root = root; btrfs_init_devices_late(fs_info); location.objectid = BTRFS_CSUM_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out; } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->csum_root = root; location.objectid = BTRFS_QUOTA_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (!IS_ERR(root)) { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); fs_info->quota_root = root; } location.objectid = BTRFS_UUID_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { ret = PTR_ERR(root); if (ret != -ENOENT) goto out; } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->uuid_root = root; } if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out; } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->free_space_root = root; } return 0; out: btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d", location.objectid, ret); return ret; } /* * Real super block validation * NOTE: super csum type and incompat features will not be checked here. * * @sb: super block to check * @mirror_num: the super block number to check its bytenr: * 0 the primary (1st) sb * 1, 2 2nd and 3rd backup copy * -1 skip bytenr check */ static int validate_super(struct btrfs_fs_info *fs_info, struct btrfs_super_block *sb, int mirror_num) { u64 nodesize = btrfs_super_nodesize(sb); u64 sectorsize = btrfs_super_sectorsize(sb); int ret = 0; if (btrfs_super_magic(sb) != BTRFS_MAGIC) { btrfs_err(fs_info, "no valid FS found"); ret = -EINVAL; } if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) { btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu", btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); ret = -EINVAL; } if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "tree_root level too big: %d >= %d", btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; } if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "chunk_root level too big: %d >= %d", btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; } if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) { btrfs_err(fs_info, "log_root level too big: %d >= %d", btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; } /* * Check sectorsize and nodesize first, other check will need it. * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. */ if (!is_power_of_2(sectorsize) || sectorsize < 4096 || sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); ret = -EINVAL; } /* Only PAGE SIZE is supported yet */ if (sectorsize != PAGE_SIZE) { btrfs_err(fs_info, "sectorsize %llu not supported yet, only support %lu", sectorsize, PAGE_SIZE); ret = -EINVAL; } if (!is_power_of_2(nodesize) || nodesize < sectorsize || nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { btrfs_err(fs_info, "invalid nodesize %llu", nodesize); ret = -EINVAL; } if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { btrfs_err(fs_info, "invalid leafsize %u, should be %llu", le32_to_cpu(sb->__unused_leafsize), nodesize); ret = -EINVAL; } /* Root alignment check */ if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) { btrfs_warn(fs_info, "tree_root block unaligned: %llu", btrfs_super_root(sb)); ret = -EINVAL; } if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) { btrfs_warn(fs_info, "chunk_root block unaligned: %llu", btrfs_super_chunk_root(sb)); ret = -EINVAL; } if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { btrfs_warn(fs_info, "log_root block unaligned: %llu", btrfs_super_log_root(sb)); ret = -EINVAL; } if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "dev_item UUID does not match fsid: %pU != %pU", fs_info->fsid, sb->dev_item.fsid); ret = -EINVAL; } /* * Hint to catch really bogus numbers, bitflips or so, more exact checks are * done later */ if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) { btrfs_err(fs_info, "bytes_used is too small %llu", btrfs_super_bytes_used(sb)); ret = -EINVAL; } if (!is_power_of_2(btrfs_super_stripesize(sb))) { btrfs_err(fs_info, "invalid stripesize %u", btrfs_super_stripesize(sb)); ret = -EINVAL; } if (btrfs_super_num_devices(sb) > (1UL << 31)) btrfs_warn(fs_info, "suspicious number of devices: %llu", btrfs_super_num_devices(sb)); if (btrfs_super_num_devices(sb) == 0) { btrfs_err(fs_info, "number of devices is 0"); ret = -EINVAL; } if (mirror_num >= 0 && btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) { btrfs_err(fs_info, "super offset mismatch %llu != %u", btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET); ret = -EINVAL; } /* * Obvious sys_chunk_array corruptions, it must hold at least one key * and one chunk */ if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { btrfs_err(fs_info, "system chunk array too big %u > %u", btrfs_super_sys_array_size(sb), BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); ret = -EINVAL; } if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) + sizeof(struct btrfs_chunk)) { btrfs_err(fs_info, "system chunk array too small %u < %zu", btrfs_super_sys_array_size(sb), sizeof(struct btrfs_disk_key) + sizeof(struct btrfs_chunk)); ret = -EINVAL; } /* * The generation is a global counter, we'll trust it more than the others * but it's still possible that it's the one that's wrong. */ if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb)) btrfs_warn(fs_info, "suspicious: generation < chunk_root_generation: %llu < %llu", btrfs_super_generation(sb), btrfs_super_chunk_root_generation(sb)); if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb) && btrfs_super_cache_generation(sb) != (u64)-1) btrfs_warn(fs_info, "suspicious: generation < cache_generation: %llu < %llu", btrfs_super_generation(sb), btrfs_super_cache_generation(sb)); return ret; } /* * Validation of super block at mount time. * Some checks already done early at mount time, like csum type and incompat * flags will be skipped. */ static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info) { return validate_super(fs_info, fs_info->super_copy, 0); } /* * Validation of super block at write time. * Some checks like bytenr check will be skipped as their values will be * overwritten soon. * Extra checks like csum type and incompat flags will be done here. */ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, struct btrfs_super_block *sb) { int ret; ret = validate_super(fs_info, sb, -1); if (ret < 0) goto out; if (btrfs_super_csum_type(sb) != BTRFS_CSUM_TYPE_CRC32) { ret = -EUCLEAN; btrfs_err(fs_info, "invalid csum type, has %u want %u", btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32); goto out; } if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) { ret = -EUCLEAN; btrfs_err(fs_info, "invalid incompat flags, has 0x%llx valid mask 0x%llx", btrfs_super_incompat_flags(sb), (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP); goto out; } out: if (ret < 0) btrfs_err(fs_info, "super block corruption detected before writing it to disk"); return ret; } int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) { u32 sectorsize; u32 nodesize; u32 stripesize; u64 generation; u64 features; struct btrfs_key location; struct buffer_head *bh; struct btrfs_super_block *disk_super; struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *tree_root; struct btrfs_root *chunk_root; int ret; int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; int clear_free_space_tree = 0; int level; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); if (!tree_root || !chunk_root) { err = -ENOMEM; goto fail; } ret = init_srcu_struct(&fs_info->subvol_srcu); if (ret) { err = ret; goto fail; } ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_srcu; } fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_dirty_metadata_bytes; } ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_delalloc_bytes; } INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); INIT_LIST_HEAD(&fs_info->delalloc_roots); INIT_LIST_HEAD(&fs_info->caching_block_groups); INIT_LIST_HEAD(&fs_info->pending_raid_kobjs); spin_lock_init(&fs_info->pending_raid_kobjs_lock); spin_lock_init(&fs_info->delalloc_root_lock); spin_lock_init(&fs_info->trans_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); mutex_init(&fs_info->delete_unused_bgs_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); mutex_init(&fs_info->cleaner_delayed_iput_mutex); seqlock_init(&fs_info->profiles_lock); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_LIST_HEAD(&fs_info->unused_bgs); btrfs_mapping_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, BTRFS_BLOCK_RSV_DELOPS); atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->defrag_running, 0); atomic_set(&fs_info->qgroup_op_seq, 0); atomic_set(&fs_info->reada_works_cnt, 0); atomic64_set(&fs_info->tree_mod_seq, 0); fs_info->sb = sb; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; atomic64_set(&fs_info->free_chunk_space, 0); fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); spin_lock_init(&fs_info->reada_lock); btrfs_init_ref_verify(fs_info); fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); INIT_LIST_HEAD(&fs_info->ordered_roots); spin_lock_init(&fs_info->ordered_root_lock); fs_info->btree_inode = new_inode(sb); if (!fs_info->btree_inode) { err = -ENOMEM; goto fail_bio_counter; } mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), GFP_KERNEL); if (!fs_info->delayed_root) { err = -ENOMEM; goto fail_iput; } btrfs_init_delayed_root(fs_info->delayed_root); btrfs_init_scrub(fs_info); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY fs_info->check_integrity_print_mask = 0; #endif btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); btrfs_init_btree_inode(fs_info); spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; fs_info->first_logical_byte = (u64)-1; extent_io_tree_init(&fs_info->freed_extents[0], NULL); extent_io_tree_init(&fs_info->freed_extents[1], NULL); fs_info->pinned_extents = &fs_info->freed_extents[0]; set_bit(BTRFS_FS_BARRIER, &fs_info->flags); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->ro_block_group_mutex); init_rwsem(&fs_info->commit_root_sem); init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); sema_init(&fs_info->uuid_tree_rescan_sem, 1); btrfs_init_dev_replace_locks(fs_info); btrfs_init_qgroup(fs_info); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); btrfs_init_free_cluster(&fs_info->data_alloc_cluster); init_waitqueue_head(&fs_info->transaction_throttle); init_waitqueue_head(&fs_info->transaction_wait); init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); INIT_LIST_HEAD(&fs_info->pinned_chunks); /* Usable values until the real ones are cached from the superblock */ fs_info->nodesize = 4096; fs_info->sectorsize = 4096; fs_info->stripesize = 4096; ret = btrfs_alloc_stripe_hash_table(fs_info); if (ret) { err = ret; goto fail_alloc; } __setup_root(tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); invalidate_bdev(fs_devices->latest_bdev); /* * Read super block and check the signature bytes only */ bh = btrfs_read_dev_super(fs_devices->latest_bdev); if (IS_ERR(bh)) { err = PTR_ERR(bh); goto fail_alloc; } /* * We want to check superblock checksum, the type is stored inside. * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). */ if (btrfs_check_super_csum(fs_info, bh->b_data)) { btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; brelse(bh); goto fail_alloc; } /* * super_copy is zeroed at allocation time and we never touch the * following bytes up to INFO_SIZE, the checksum is calculated from * the whole block of INFO_SIZE */ memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); memcpy(fs_info->super_for_commit, fs_info->super_copy, sizeof(*fs_info->super_for_commit)); brelse(bh); memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); ret = btrfs_validate_mount_super(fs_info); if (ret) { btrfs_err(fs_info, "superblock contains fatal errors"); err = -EINVAL; goto fail_alloc; } disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) goto fail_alloc; /* check FS state, whether FS is broken. */ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); /* * run through our array of backup supers and setup * our ring pointer to the oldest one */ generation = btrfs_super_generation(disk_super); find_oldest_super_backup(fs_info, generation); /* * In the long term, we'll store the compression type in the super * block, and it'll be used for per file compression control. */ fs_info->compress_type = BTRFS_COMPRESS_ZLIB; ret = btrfs_parse_options(fs_info, options, sb->s_flags); if (ret) { err = ret; goto fail_alloc; } features = btrfs_super_incompat_flags(disk_super) & ~BTRFS_FEATURE_INCOMPAT_SUPP; if (features) { btrfs_err(fs_info, "cannot mount because of unsupported optional features (0x%llx)", features); err = -EINVAL; goto fail_alloc; } features = btrfs_super_incompat_flags(disk_super); features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; if (fs_info->compress_type == BTRFS_COMPRESS_LZO) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD; if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) btrfs_info(fs_info, "has skinny extents"); /* * flag our filesystem as having big metadata blocks if * they are bigger than the page size */ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) btrfs_info(fs_info, "flagging fs with big metadata feature"); features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; } nodesize = btrfs_super_nodesize(disk_super); sectorsize = btrfs_super_sectorsize(disk_super); stripesize = sectorsize; fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); /* Cache block sizes */ fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->stripesize = stripesize; /* * mixed block groups end up with duplicate but slightly offset * extent buffers for the same range. It leads to corruptions */ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && (sectorsize != nodesize)) { btrfs_err(fs_info, "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", nodesize, sectorsize); goto fail_alloc; } /* * Needn't use the lock because there is no other task which will * update the flag. */ btrfs_set_super_incompat_flags(disk_super, features); features = btrfs_super_compat_ro_flags(disk_super) & ~BTRFS_FEATURE_COMPAT_RO_SUPP; if (!sb_rdonly(sb) && features) { btrfs_err(fs_info, "cannot mount read-write because of unsupported optional features (0x%llx)", features); err = -EINVAL; goto fail_alloc; } /* * We have unsupported RO compat features, although RO mounted, we * should not cause any metadata write, including log replay. * Or we could screw up whatever the new feature requires. */ if (unlikely(features && btrfs_super_log_root(disk_super) && !btrfs_test_opt(fs_info, NOLOGREPLAY))) { btrfs_err(fs_info, "cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay", features); err = -EINVAL; goto fail_alloc; } ret = btrfs_init_workqueues(fs_info, fs_devices); if (ret) { err = ret; goto fail_sb_buffer; } sb->s_bdi->congested_fn = btrfs_congested_fn; sb->s_bdi->congested_data = fs_info; sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE; sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE); mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(fs_info); mutex_unlock(&fs_info->chunk_mutex); if (ret) { btrfs_err(fs_info, "failed to read the system array: %d", ret); goto fail_sb_buffer; } generation = btrfs_super_chunk_root_generation(disk_super); level = btrfs_super_chunk_root_level(disk_super); __setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); chunk_root->node = read_tree_block(fs_info, btrfs_super_chunk_root(disk_super), generation, level, NULL); if (IS_ERR(chunk_root->node) || !extent_buffer_uptodate(chunk_root->node)) { btrfs_err(fs_info, "failed to read chunk root"); if (!IS_ERR(chunk_root->node)) free_extent_buffer(chunk_root->node); chunk_root->node = NULL; goto fail_tree_roots; } btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); chunk_root->commit_root = btrfs_root_node(chunk_root); read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE); ret = btrfs_read_chunk_tree(fs_info); if (ret) { btrfs_err(fs_info, "failed to read chunk tree: %d", ret); goto fail_tree_roots; } /* * Keep the devid that is marked to be the target device for the * device replace procedure */ btrfs_free_extra_devids(fs_devices, 0); if (!fs_devices->latest_bdev) { btrfs_err(fs_info, "failed to read devices"); goto fail_tree_roots; } retry_root_backup: generation = btrfs_super_generation(disk_super); level = btrfs_super_root_level(disk_super); tree_root->node = read_tree_block(fs_info, btrfs_super_root(disk_super), generation, level, NULL); if (IS_ERR(tree_root->node) || !extent_buffer_uptodate(tree_root->node)) { btrfs_warn(fs_info, "failed to read tree root"); if (!IS_ERR(tree_root->node)) free_extent_buffer(tree_root->node); tree_root->node = NULL; goto recovery_tree_root; } btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); btrfs_set_root_refs(&tree_root->root_item, 1); mutex_lock(&tree_root->objectid_mutex); ret = btrfs_find_highest_objectid(tree_root, &tree_root->highest_objectid); if (ret) { mutex_unlock(&tree_root->objectid_mutex); goto recovery_tree_root; } ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); mutex_unlock(&tree_root->objectid_mutex); ret = btrfs_read_roots(fs_info); if (ret) goto recovery_tree_root; fs_info->generation = generation; fs_info->last_trans_committed = generation; /* * If we have a uuid root and we're not being told to rescan we need to * check the generation here so we can set the * BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the * transaction during a balance or the log replay without updating the * uuid generation, and then if we crash we would rescan the uuid tree, * even though it was perfectly fine. */ if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) && fs_info->generation == btrfs_super_uuid_tree_generation(disk_super)) set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); ret = btrfs_verify_dev_extents(fs_info); if (ret) { btrfs_err(fs_info, "failed to verify dev extents against chunks: %d", ret); goto fail_block_groups; } ret = btrfs_recover_balance(fs_info); if (ret) { btrfs_err(fs_info, "failed to recover balance: %d", ret); goto fail_block_groups; } ret = btrfs_init_dev_stats(fs_info); if (ret) { btrfs_err(fs_info, "failed to init dev_stats: %d", ret); goto fail_block_groups; } ret = btrfs_init_dev_replace(fs_info); if (ret) { btrfs_err(fs_info, "failed to init dev_replace: %d", ret); goto fail_block_groups; } btrfs_free_extra_devids(fs_devices, 1); ret = btrfs_sysfs_add_fsid(fs_devices, NULL); if (ret) { btrfs_err(fs_info, "failed to init sysfs fsid interface: %d", ret); goto fail_block_groups; } ret = btrfs_sysfs_add_device(fs_devices); if (ret) { btrfs_err(fs_info, "failed to init sysfs device interface: %d", ret); goto fail_fsdev_sysfs; } ret = btrfs_sysfs_add_mounted(fs_info); if (ret) { btrfs_err(fs_info, "failed to init sysfs interface: %d", ret); goto fail_fsdev_sysfs; } ret = btrfs_init_space_info(fs_info); if (ret) { btrfs_err(fs_info, "failed to initialize space info: %d", ret); goto fail_sysfs; } ret = btrfs_read_block_groups(fs_info); if (ret) { btrfs_err(fs_info, "failed to read block groups: %d", ret); goto fail_sysfs; } if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices && !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, "writeable mount is not allowed due to too many missing devices"); goto fail_sysfs; } fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); if (IS_ERR(fs_info->cleaner_kthread)) goto fail_sysfs; fs_info->transaction_kthread = kthread_run(transaction_kthread, tree_root, "btrfs-transaction"); if (IS_ERR(fs_info->transaction_kthread)) goto fail_cleaner; if (!btrfs_test_opt(fs_info, NOSSD) && !fs_info->fs_devices->rotating) { btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations"); } /* * Mount does not set all options immediately, we can do it now and do * not have to wait for transaction commit */ btrfs_apply_pending_changes(fs_info); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) { ret = btrfsic_mount(fs_info, fs_devices, btrfs_test_opt(fs_info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? 1 : 0, fs_info->check_integrity_print_mask); if (ret) btrfs_warn(fs_info, "failed to initialize integrity check module: %d", ret); } #endif ret = btrfs_read_qgroup_config(fs_info); if (ret) goto fail_trans_kthread; if (btrfs_build_ref_tree(fs_info)) btrfs_err(fs_info, "couldn't build ref tree"); /* do not make disk changes in broken FS or nologreplay is given */ if (btrfs_super_log_root(disk_super) != 0 && !btrfs_test_opt(fs_info, NOLOGREPLAY)) { btrfs_info(fs_info, "start tree-log replay"); ret = btrfs_replay_log(fs_info, fs_devices); if (ret) { err = ret; goto fail_qgroup; } } ret = btrfs_find_orphan_roots(fs_info); if (ret) goto fail_qgroup; if (!sb_rdonly(sb)) { ret = btrfs_cleanup_fs_roots(fs_info); if (ret) goto fail_qgroup; mutex_lock(&fs_info->cleaner_mutex); ret = btrfs_recover_relocation(tree_root); mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { btrfs_warn(fs_info, "failed to recover relocation: %d", ret); err = -EINVAL; goto fail_qgroup; } } location.objectid = BTRFS_FS_TREE_OBJECTID; location.type = BTRFS_ROOT_ITEM_KEY; location.offset = 0; fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); btrfs_warn(fs_info, "failed to read fs tree: %d", err); fs_info->fs_root = NULL; goto fail_qgroup; } if (sb_rdonly(sb)) return 0; if (btrfs_test_opt(fs_info, CLEAR_CACHE) && btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { clear_free_space_tree = 1; } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) { btrfs_warn(fs_info, "free space tree is invalid"); clear_free_space_tree = 1; } if (clear_free_space_tree) { btrfs_info(fs_info, "clearing free space tree"); ret = btrfs_clear_free_space_tree(fs_info); if (ret) { btrfs_warn(fs_info, "failed to clear free space tree: %d", ret); close_ctree(fs_info); return ret; } } if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { btrfs_info(fs_info, "creating free space tree"); ret = btrfs_create_free_space_tree(fs_info); if (ret) { btrfs_warn(fs_info, "failed to create free space tree: %d", ret); close_ctree(fs_info); return ret; } } down_read(&fs_info->cleanup_work_sem); if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) || (ret = btrfs_orphan_cleanup(fs_info->tree_root))) { up_read(&fs_info->cleanup_work_sem); close_ctree(fs_info); return ret; } up_read(&fs_info->cleanup_work_sem); ret = btrfs_resume_balance_async(fs_info); if (ret) { btrfs_warn(fs_info, "failed to resume balance: %d", ret); close_ctree(fs_info); return ret; } ret = btrfs_resume_dev_replace_async(fs_info); if (ret) { btrfs_warn(fs_info, "failed to resume device replace: %d", ret); close_ctree(fs_info); return ret; } btrfs_qgroup_rescan_resume(fs_info); if (!fs_info->uuid_root) { btrfs_info(fs_info, "creating UUID tree"); ret = btrfs_create_uuid_tree(fs_info); if (ret) { btrfs_warn(fs_info, "failed to create the UUID tree: %d", ret); close_ctree(fs_info); return ret; } } else if (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) || fs_info->generation != btrfs_super_uuid_tree_generation(disk_super)) { btrfs_info(fs_info, "checking UUID tree"); ret = btrfs_check_uuid_tree(fs_info); if (ret) { btrfs_warn(fs_info, "failed to check the UUID tree: %d", ret); close_ctree(fs_info); return ret; } } set_bit(BTRFS_FS_OPEN, &fs_info->flags); /* * backuproot only affect mount behavior, and if open_ctree succeeded, * no need to keep the flag */ btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); return 0; fail_qgroup: btrfs_free_qgroup_config(fs_info); fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); btrfs_cleanup_transaction(fs_info); btrfs_free_fs_roots(fs_info); fail_cleaner: kthread_stop(fs_info->cleaner_kthread); /* * make sure we're done with the btree inode before we stop our * kthreads */ filemap_write_and_wait(fs_info->btree_inode->i_mapping); fail_sysfs: btrfs_sysfs_remove_mounted(fs_info); fail_fsdev_sysfs: btrfs_sysfs_remove_fsid(fs_info->fs_devices); fail_block_groups: btrfs_put_block_group_cache(fs_info); fail_tree_roots: free_root_pointers(fs_info, true); invalidate_inode_pages2(fs_info->btree_inode->i_mapping); fail_sb_buffer: btrfs_stop_all_workers(fs_info); btrfs_free_block_groups(fs_info); fail_alloc: fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); iput(fs_info->btree_inode); fail_bio_counter: percpu_counter_destroy(&fs_info->bio_counter); fail_delalloc_bytes: percpu_counter_destroy(&fs_info->delalloc_bytes); fail_dirty_metadata_bytes: percpu_counter_destroy(&fs_info->dirty_metadata_bytes); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: btrfs_free_stripe_hash_table(fs_info); btrfs_close_devices(fs_info->fs_devices); return err; recovery_tree_root: if (!btrfs_test_opt(fs_info, USEBACKUPROOT)) goto fail_tree_roots; free_root_pointers(fs_info, false); /* don't use the log in recovery mode, it won't be valid */ btrfs_set_super_log_root(disk_super, 0); /* we can't trust the free space cache either */ btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); ret = next_root_backup(fs_info, fs_info->super_copy, &num_backups_tried, &backup_index); if (ret == -1) goto fail_block_groups; goto retry_root_backup; } ALLOW_ERROR_INJECTION(open_ctree, ERRNO); static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) { if (uptodate) { set_buffer_uptodate(bh); } else { struct btrfs_device *device = (struct btrfs_device *) bh->b_private; btrfs_warn_rl_in_rcu(device->fs_info, "lost page write due to IO error on %s", rcu_str_deref(device->name)); /* note, we don't set_buffer_write_io_error because we have * our own ways of dealing with the IO errors */ clear_buffer_uptodate(bh); btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS); } unlock_buffer(bh); put_bh(bh); } int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, struct buffer_head **bh_ret) { struct buffer_head *bh; struct btrfs_super_block *super; u64 bytenr; bytenr = btrfs_sb_offset(copy_num); if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) return -EINVAL; bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE); /* * If we fail to read from the underlying devices, as of now * the best option we have is to mark it EIO. */ if (!bh) return -EIO; super = (struct btrfs_super_block *)bh->b_data; if (btrfs_super_bytenr(super) != bytenr || btrfs_super_magic(super) != BTRFS_MAGIC) { brelse(bh); return -EINVAL; } *bh_ret = bh; return 0; } struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) { struct buffer_head *bh; struct buffer_head *latest = NULL; struct btrfs_super_block *super; int i; u64 transid = 0; int ret = -EINVAL; /* we would like to check all the supers, but that would make * a btrfs mount succeed after a mkfs from a different FS. * So, we need to add a special mount option to scan for * later supers, using BTRFS_SUPER_MIRROR_MAX instead */ for (i = 0; i < 1; i++) { ret = btrfs_read_dev_one_super(bdev, i, &bh); if (ret) continue; super = (struct btrfs_super_block *)bh->b_data; if (!latest || btrfs_super_generation(super) > transid) { brelse(latest); latest = bh; transid = btrfs_super_generation(super); } else { brelse(bh); } } if (!latest) return ERR_PTR(ret); return latest; } /* * Write superblock @sb to the @device. Do not wait for completion, all the * buffer heads we write are pinned. * * Write @max_mirrors copies of the superblock, where 0 means default that fit * the expected device size at commit time. Note that max_mirrors must be * same for write and wait phases. * * Return number of errors when buffer head is not found or submission fails. */ static int write_dev_supers(struct btrfs_device *device, struct btrfs_super_block *sb, int max_mirrors) { struct buffer_head *bh; int i; int ret; int errors = 0; u32 crc; u64 bytenr; int op_flags; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->commit_total_bytes) break; btrfs_set_super_bytenr(sb, bytenr); crc = ~(u32)0; crc = btrfs_csum_data((const char *)sb + BTRFS_CSUM_SIZE, crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); btrfs_csum_final(crc, sb->csum); /* One reference for us, and we leave it for the caller */ bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE); if (!bh) { btrfs_err(device->fs_info, "couldn't get super buffer head for bytenr %llu", bytenr); errors++; continue; } memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); /* one reference for submit_bh */ get_bh(bh); set_buffer_uptodate(bh); lock_buffer(bh); bh->b_end_io = btrfs_end_buffer_write_sync; bh->b_private = device; /* * we fua the first super. The others we allow * to go down lazy. */ op_flags = REQ_SYNC | REQ_META | REQ_PRIO; if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) op_flags |= REQ_FUA; ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh); if (ret) errors++; } return errors < i ? 0 : -1; } /* * Wait for write completion of superblocks done by write_dev_supers, * @max_mirrors same for write and wait phases. * * Return number of errors when buffer head is not found or not marked up to * date. */ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) { struct buffer_head *bh; int i; int errors = 0; bool primary_failed = false; u64 bytenr; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->commit_total_bytes) break; bh = __find_get_block(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE); if (!bh) { errors++; if (i == 0) primary_failed = true; continue; } wait_on_buffer(bh); if (!buffer_uptodate(bh)) { errors++; if (i == 0) primary_failed = true; } /* drop our reference */ brelse(bh); /* drop the reference from the writing run */ brelse(bh); } /* log error, force error return */ if (primary_failed) { btrfs_err(device->fs_info, "error writing primary super block to device %llu", device->devid); return -1; } return errors < i ? 0 : -1; } /* * endio for the write_dev_flush, this will wake anyone waiting * for the barrier when it is done */ static void btrfs_end_empty_barrier(struct bio *bio) { complete(bio->bi_private); } /* * Submit a flush request to the device if it supports it. Error handling is * done in the waiting counterpart. */ static void write_dev_flush(struct btrfs_device *device) { struct bio *bio = device->flush_bio; #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY /* * When a disk has write caching disabled, we skip submission of a bio * with flush and sync requests before writing the superblock, since * it's not needed. However when the integrity checker is enabled, this * results in reports that there are metadata blocks referred by a * superblock that were not properly flushed. So don't skip the bio * submission only when the integrity checker is enabled for the sake * of simplicity, since this is a debug tool and not meant for use in * non-debug builds. */ struct request_queue *q = bdev_get_queue(device->bdev); if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) return; #endif bio_reset(bio); bio->bi_end_io = btrfs_end_empty_barrier; bio_set_dev(bio, device->bdev); bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; btrfsic_submit_bio(bio); set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); } /* * If the flush bio has been submitted by write_dev_flush, wait for it. */ static blk_status_t wait_dev_flush(struct btrfs_device *device) { struct bio *bio = device->flush_bio; if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) return BLK_STS_OK; clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); wait_for_completion_io(&device->flush_wait); return bio->bi_status; } static int check_barrier_error(struct btrfs_fs_info *fs_info) { if (!btrfs_check_rw_degradable(fs_info, NULL)) return -EIO; return 0; } /* * send an empty flush down to each device in parallel, * then wait for them */ static int barrier_all_devices(struct btrfs_fs_info *info) { struct list_head *head; struct btrfs_device *dev; int errors_wait = 0; blk_status_t ret; lockdep_assert_held(&info->fs_devices->device_list_mutex); /* send down all the barriers */ head = &info->fs_devices->devices; list_for_each_entry(dev, head, dev_list) { if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) continue; if (!dev->bdev) continue; if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; write_dev_flush(dev); dev->last_flush_error = BLK_STS_OK; } /* wait for all the barriers */ list_for_each_entry(dev, head, dev_list) { if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) continue; if (!dev->bdev) { errors_wait++; continue; } if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; ret = wait_dev_flush(dev); if (ret) { dev->last_flush_error = ret; btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); errors_wait++; } } if (errors_wait) { /* * At some point we need the status of all disks * to arrive at the volume status. So error checking * is being pushed to a separate loop. */ return check_barrier_error(info); } return 0; } int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) { int raid_type; int min_tolerated = INT_MAX; if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 || (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE)) min_tolerated = min(min_tolerated, btrfs_raid_array[BTRFS_RAID_SINGLE]. tolerated_failures); for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { if (raid_type == BTRFS_RAID_SINGLE) continue; if (!(flags & btrfs_raid_array[raid_type].bg_flag)) continue; min_tolerated = min(min_tolerated, btrfs_raid_array[raid_type]. tolerated_failures); } if (min_tolerated == INT_MAX) { pr_warn("BTRFS: unknown raid flag: %llu", flags); min_tolerated = 0; } return min_tolerated; } int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) { struct list_head *head; struct btrfs_device *dev; struct btrfs_super_block *sb; struct btrfs_dev_item *dev_item; int ret; int do_barriers; int max_errors; int total_errors = 0; u64 flags; do_barriers = !btrfs_test_opt(fs_info, NOBARRIER); /* * max_mirrors == 0 indicates we're from commit_transaction, * not from fsync where the tree roots in fs_info have not * been consistent on disk. */ if (max_mirrors == 0) backup_super_roots(fs_info); sb = fs_info->super_for_commit; dev_item = &sb->dev_item; mutex_lock(&fs_info->fs_devices->device_list_mutex); head = &fs_info->fs_devices->devices; max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1; if (do_barriers) { ret = barrier_all_devices(fs_info); if (ret) { mutex_unlock( &fs_info->fs_devices->device_list_mutex); btrfs_handle_fs_error(fs_info, ret, "errors while submitting device barriers."); return ret; } } list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) { total_errors++; continue; } if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; btrfs_set_stack_device_generation(dev_item, 0); btrfs_set_stack_device_type(dev_item, dev->type); btrfs_set_stack_device_id(dev_item, dev->devid); btrfs_set_stack_device_total_bytes(dev_item, dev->commit_total_bytes); btrfs_set_stack_device_bytes_used(dev_item, dev->commit_bytes_used); btrfs_set_stack_device_io_align(dev_item, dev->io_align); btrfs_set_stack_device_io_width(dev_item, dev->io_width); btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE); flags = btrfs_super_flags(sb); btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); ret = btrfs_validate_write_super(fs_info, sb); if (ret < 0) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_handle_fs_error(fs_info, -EUCLEAN, "unexpected superblock corruption detected"); return -EUCLEAN; } ret = write_dev_supers(dev, sb, max_mirrors); if (ret) total_errors++; } if (total_errors > max_errors) { btrfs_err(fs_info, "%d errors while writing supers", total_errors); mutex_unlock(&fs_info->fs_devices->device_list_mutex); /* FUA is masked off if unsupported and can't be the reason */ btrfs_handle_fs_error(fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } total_errors = 0; list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) continue; if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; ret = wait_dev_supers(dev, max_mirrors); if (ret) total_errors++; } mutex_unlock(&fs_info->fs_devices->device_list_mutex); if (total_errors > max_errors) { btrfs_handle_fs_error(fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } return 0; } /* Drop a fs root from the radix tree and free it. */ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid); spin_unlock(&fs_info->fs_roots_radix_lock); if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { btrfs_free_log(NULL, root); if (root->reloc_root) { free_extent_buffer(root->reloc_root->node); free_extent_buffer(root->reloc_root->commit_root); btrfs_put_fs_root(root->reloc_root); root->reloc_root = NULL; } } if (root->free_ino_pinned) __btrfs_remove_free_space_cache(root->free_ino_pinned); if (root->free_ino_ctl) __btrfs_remove_free_space_cache(root->free_ino_ctl); btrfs_free_fs_root(root); } void btrfs_free_fs_root(struct btrfs_root *root) { iput(root->ino_cache_inode); WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); if (root->anon_dev) free_anon_bdev(root->anon_dev); if (root->subv_writers) btrfs_free_subvolume_writers(root->subv_writers); free_extent_buffer(root->node); free_extent_buffer(root->commit_root); kfree(root->free_ino_ctl); kfree(root->free_ino_pinned); btrfs_put_fs_root(root); } int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { u64 root_objectid = 0; struct btrfs_root *gang[8]; int i = 0; int err = 0; unsigned int ret = 0; int index; while (1) { index = srcu_read_lock(&fs_info->subvol_srcu); ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, root_objectid, ARRAY_SIZE(gang)); if (!ret) { srcu_read_unlock(&fs_info->subvol_srcu, index); break; } root_objectid = gang[ret - 1]->root_key.objectid + 1; for (i = 0; i < ret; i++) { /* Avoid to grab roots in dead_roots */ if (btrfs_root_refs(&gang[i]->root_item) == 0) { gang[i] = NULL; continue; } /* grab all the search result for later use */ gang[i] = btrfs_grab_fs_root(gang[i]); } srcu_read_unlock(&fs_info->subvol_srcu, index); for (i = 0; i < ret; i++) { if (!gang[i]) continue; root_objectid = gang[i]->root_key.objectid; err = btrfs_orphan_cleanup(gang[i]); if (err) break; btrfs_put_fs_root(gang[i]); } root_objectid++; } /* release the uncleaned roots due to error */ for (; i < ret; i++) { if (gang[i]) btrfs_put_fs_root(gang[i]); } return err; } int btrfs_commit_super(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; mutex_lock(&fs_info->cleaner_mutex); btrfs_run_delayed_iputs(fs_info); mutex_unlock(&fs_info->cleaner_mutex); wake_up_process(fs_info->cleaner_kthread); /* wait until ongoing cleanup work done */ down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); return btrfs_commit_transaction(trans); } void close_ctree(struct btrfs_fs_info *fs_info) { int ret; set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); /* * We don't want the cleaner to start new transactions, add more delayed * iputs, etc. while we're closing. We can't use kthread_stop() yet * because that frees the task_struct, and the transaction kthread might * still try to wake up the cleaner. */ kthread_park(fs_info->cleaner_kthread); /* wait for the qgroup rescan worker to stop */ btrfs_qgroup_wait_for_completion(fs_info, false); /* wait for the uuid_scan task to finish */ down(&fs_info->uuid_tree_rescan_sem); /* avoid complains from lockdep et al., set sem back to initial state */ up(&fs_info->uuid_tree_rescan_sem); /* pause restriper - we want to resume on mount */ btrfs_pause_balance(fs_info); btrfs_dev_replace_suspend_for_unmount(fs_info); btrfs_scrub_cancel(fs_info); /* wait for any defraggers to finish */ wait_event(fs_info->transaction_wait, (atomic_read(&fs_info->defrag_running) == 0)); /* clear out the rbtree of defraggable inodes */ btrfs_cleanup_defrag_inodes(fs_info); cancel_work_sync(&fs_info->async_reclaim_work); if (!sb_rdonly(fs_info->sb)) { /* * The cleaner kthread is stopped, so do one final pass over * unused block groups. */ btrfs_delete_unused_bgs(fs_info); /* * There might be existing delayed inode workers still running * and holding an empty delayed inode item. We must wait for * them to complete first because they can create a transaction. * This happens when someone calls btrfs_balance_delayed_items() * and then a transaction commit runs the same delayed nodes * before any delayed worker has done something with the nodes. * We must wait for any worker here and not at transaction * commit time since that could cause a deadlock. * This is a very rare case. */ btrfs_flush_workqueue(fs_info->delayed_workers); ret = btrfs_commit_super(fs_info); if (ret) btrfs_err(fs_info, "commit super ret %d", ret); } if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) || test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) btrfs_error_commit_super(fs_info); kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags); btrfs_free_qgroup_config(fs_info); ASSERT(list_empty(&fs_info->delalloc_roots)); if (percpu_counter_sum(&fs_info->delalloc_bytes)) { btrfs_info(fs_info, "at unmount delalloc count %lld", percpu_counter_sum(&fs_info->delalloc_bytes)); } btrfs_sysfs_remove_mounted(fs_info); btrfs_sysfs_remove_fsid(fs_info->fs_devices); btrfs_free_fs_roots(fs_info); btrfs_put_block_group_cache(fs_info); /* * we must make sure there is not any read request to * submit after we stopping all workers. */ invalidate_inode_pages2(fs_info->btree_inode->i_mapping); btrfs_stop_all_workers(fs_info); clear_bit(BTRFS_FS_OPEN, &fs_info->flags); free_root_pointers(fs_info, true); /* * We must free the block groups after dropping the fs_roots as we could * have had an IO error and have left over tree log blocks that aren't * cleaned up until the fs roots are freed. This makes the block group * accounting appear to be wrong because there's pending reserved bytes, * so make sure we do the block group cleanup afterwards. */ btrfs_free_block_groups(fs_info); iput(fs_info->btree_inode); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) btrfsic_unmount(fs_info->fs_devices); #endif btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); percpu_counter_destroy(&fs_info->bio_counter); cleanup_srcu_struct(&fs_info->subvol_srcu); btrfs_free_stripe_hash_table(fs_info); btrfs_free_ref_cache(fs_info); while (!list_empty(&fs_info->pinned_chunks)) { struct extent_map *em; em = list_first_entry(&fs_info->pinned_chunks, struct extent_map, list); list_del_init(&em->list); free_extent_map(em); } } int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic) { int ret; struct inode *btree_inode = buf->pages[0]->mapping->host; ret = extent_buffer_uptodate(buf); if (!ret) return ret; ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, parent_transid, atomic); if (ret == -EAGAIN) return ret; return !ret; } void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { struct btrfs_fs_info *fs_info; struct btrfs_root *root; u64 transid = btrfs_header_generation(buf); int was_dirty; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* * This is a fast path so only do this check if we have sanity tests * enabled. Normal people shouldn't be using umapped buffers as dirty * outside of the sanity tests. */ if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags))) return; #endif root = BTRFS_I(buf->pages[0]->mapping->host)->root; fs_info = root->fs_info; btrfs_assert_tree_locked(buf); if (transid != fs_info->generation) WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n", buf->start, transid, fs_info->generation); was_dirty = set_extent_buffer_dirty(buf); if (!was_dirty) percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, buf->len, fs_info->dirty_metadata_batch); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY /* * Since btrfs_mark_buffer_dirty() can be called with item pointer set * but item data not updated. * So here we should only check item pointers, not item data. */ if (btrfs_header_level(buf) == 0 && btrfs_check_leaf_relaxed(fs_info, buf)) { btrfs_print_leaf(buf); ASSERT(0); } #endif } static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info, int flush_delayed) { /* * looks as though older kernels can get into trouble with * this code, they end up stuck in balance_dirty_pages forever */ int ret; if (current->flags & PF_MEMALLOC) return; if (flush_delayed) btrfs_balance_delayed_items(fs_info); ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes, BTRFS_DIRTY_METADATA_THRESH, fs_info->dirty_metadata_batch); if (ret > 0) { balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping); } } void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info) { __btrfs_btree_balance_dirty(fs_info, 1); } void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info) { __btrfs_btree_balance_dirty(fs_info, 0); } int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, struct btrfs_key *first_key) { struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root; struct btrfs_fs_info *fs_info = root->fs_info; return btree_read_extent_buffer_pages(fs_info, buf, parent_transid, level, first_key); } static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) { /* cleanup FS via transaction */ btrfs_cleanup_transaction(fs_info); mutex_lock(&fs_info->cleaner_mutex); btrfs_run_delayed_iputs(fs_info); mutex_unlock(&fs_info->cleaner_mutex); down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); } static void btrfs_destroy_ordered_extents(struct btrfs_root *root) { struct btrfs_ordered_extent *ordered; spin_lock(&root->ordered_extent_lock); /* * This will just short circuit the ordered completion stuff which will * make sure the ordered extent gets properly cleaned up. */ list_for_each_entry(ordered, &root->ordered_extents, root_extent_list) set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); spin_unlock(&root->ordered_extent_lock); } static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; struct list_head splice; INIT_LIST_HEAD(&splice); spin_lock(&fs_info->ordered_root_lock); list_splice_init(&fs_info->ordered_roots, &splice); while (!list_empty(&splice)) { root = list_first_entry(&splice, struct btrfs_root, ordered_root); list_move_tail(&root->ordered_root, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); btrfs_destroy_ordered_extents(root); cond_resched(); spin_lock(&fs_info->ordered_root_lock); } spin_unlock(&fs_info->ordered_root_lock); /* * We need this here because if we've been flipped read-only we won't * get sync() from the umount, so we need to make sure any ordered * extents that haven't had their dirty pages IO start writeout yet * actually get run and error out properly. */ btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info) { struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; int ret = 0; delayed_refs = &trans->delayed_refs; spin_lock(&delayed_refs->lock); if (atomic_read(&delayed_refs->num_entries) == 0) { spin_unlock(&delayed_refs->lock); btrfs_info(fs_info, "delayed_refs has NO entry"); return ret; } while ((node = rb_first(&delayed_refs->href_root)) != NULL) { struct btrfs_delayed_ref_head *head; struct rb_node *n; bool pin_bytes = false; head = rb_entry(node, struct btrfs_delayed_ref_head, href_node); if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); spin_lock(&delayed_refs->lock); continue; } spin_lock(&head->lock); while ((n = rb_first(&head->ref_tree)) != NULL) { ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); ref->in_tree = 0; rb_erase(&ref->ref_node, &head->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); atomic_dec(&delayed_refs->num_entries); btrfs_put_delayed_ref(ref); } if (head->must_insert_reserved) pin_bytes = true; btrfs_free_delayed_extent_op(head->extent_op); delayed_refs->num_heads--; if (head->processing == 0) delayed_refs->num_heads_ready--; atomic_dec(&delayed_refs->num_entries); rb_erase(&head->href_node, &delayed_refs->href_root); RB_CLEAR_NODE(&head->href_node); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); if (pin_bytes) btrfs_pin_extent(fs_info, head->bytenr, head->num_bytes, 1); btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); } spin_unlock(&delayed_refs->lock); return ret; } static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; struct list_head splice; INIT_LIST_HEAD(&splice); spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { struct inode *inode = NULL; btrfs_inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes); __btrfs_del_delalloc_inode(root, btrfs_inode); spin_unlock(&root->delalloc_lock); /* * Make sure we get a live inode and that it'll not disappear * meanwhile. */ inode = igrab(&btrfs_inode->vfs_inode); if (inode) { invalidate_inode_pages2(inode->i_mapping); iput(inode); } spin_lock(&root->delalloc_lock); } spin_unlock(&root->delalloc_lock); } static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; struct list_head splice; INIT_LIST_HEAD(&splice); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); while (!list_empty(&splice)) { root = list_first_entry(&splice, struct btrfs_root, delalloc_root); root = btrfs_grab_fs_root(root); BUG_ON(!root); spin_unlock(&fs_info->delalloc_root_lock); btrfs_destroy_delalloc_inodes(root); btrfs_put_fs_root(root); spin_lock(&fs_info->delalloc_root_lock); } spin_unlock(&fs_info->delalloc_root_lock); } static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark) { int ret; struct extent_buffer *eb; u64 start = 0; u64 end; while (1) { ret = find_first_extent_bit(dirty_pages, start, &start, &end, mark, NULL); if (ret) break; clear_extent_bits(dirty_pages, start, end, mark); while (start <= end) { eb = find_extent_buffer(fs_info, start); start += fs_info->nodesize; if (!eb) continue; wait_on_extent_buffer_writeback(eb); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) clear_extent_buffer_dirty(eb); free_extent_buffer_stale(eb); } } return ret; } static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, struct extent_io_tree *pinned_extents) { struct extent_io_tree *unpin; u64 start; u64 end; int ret; bool loop = true; unpin = pinned_extents; again: while (1) { struct extent_state *cached_state = NULL; /* * The btrfs_finish_extent_commit() may get the same range as * ours between find_first_extent_bit and clear_extent_dirty. * Hence, hold the unused_bg_unpin_mutex to avoid double unpin * the same extent range. */ mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, &cached_state); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } clear_extent_dirty(unpin, start, end, &cached_state); free_extent_state(cached_state); btrfs_error_unpin_extent_range(fs_info, start, end); mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); } if (loop) { if (unpin == &fs_info->freed_extents[0]) unpin = &fs_info->freed_extents[1]; else unpin = &fs_info->freed_extents[0]; loop = false; goto again; } return 0; } static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache) { struct inode *inode; inode = cache->io_ctl.inode; if (inode) { invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; cache->io_ctl.inode = NULL; iput(inode); } ASSERT(cache->io_ctl.pages == NULL); btrfs_put_block_group(cache); } void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, struct btrfs_fs_info *fs_info) { struct btrfs_block_group_cache *cache; spin_lock(&cur_trans->dirty_bgs_lock); while (!list_empty(&cur_trans->dirty_bgs)) { cache = list_first_entry(&cur_trans->dirty_bgs, struct btrfs_block_group_cache, dirty_list); if (!list_empty(&cache->io_list)) { spin_unlock(&cur_trans->dirty_bgs_lock); list_del_init(&cache->io_list); btrfs_cleanup_bg_io(cache); spin_lock(&cur_trans->dirty_bgs_lock); } list_del_init(&cache->dirty_list); spin_lock(&cache->lock); cache->disk_cache_state = BTRFS_DC_ERROR; spin_unlock(&cache->lock); spin_unlock(&cur_trans->dirty_bgs_lock); btrfs_put_block_group(cache); spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); /* * Refer to the definition of io_bgs member for details why it's safe * to use it without any locking */ while (!list_empty(&cur_trans->io_bgs)) { cache = list_first_entry(&cur_trans->io_bgs, struct btrfs_block_group_cache, io_list); list_del_init(&cache->io_list); spin_lock(&cache->lock); cache->disk_cache_state = BTRFS_DC_ERROR; spin_unlock(&cache->lock); btrfs_cleanup_bg_io(cache); } } void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_fs_info *fs_info) { btrfs_cleanup_dirty_bgs(cur_trans, fs_info); ASSERT(list_empty(&cur_trans->dirty_bgs)); ASSERT(list_empty(&cur_trans->io_bgs)); btrfs_destroy_delayed_refs(cur_trans, fs_info); cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&fs_info->transaction_blocked_wait); cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&fs_info->transaction_wait); btrfs_destroy_delayed_inodes(fs_info); btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages, EXTENT_DIRTY); btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents); cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); } static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) { struct btrfs_transaction *t; mutex_lock(&fs_info->transaction_kthread_mutex); spin_lock(&fs_info->trans_lock); while (!list_empty(&fs_info->trans_list)) { t = list_first_entry(&fs_info->trans_list, struct btrfs_transaction, list); if (t->state >= TRANS_STATE_COMMIT_START) { refcount_inc(&t->use_count); spin_unlock(&fs_info->trans_lock); btrfs_wait_for_commit(fs_info, t->transid); btrfs_put_transaction(t); spin_lock(&fs_info->trans_lock); continue; } if (t == fs_info->running_transaction) { t->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&fs_info->trans_lock); /* * We wait for 0 num_writers since we don't hold a trans * handle open currently for this transaction. */ wait_event(t->writer_wait, atomic_read(&t->num_writers) == 0); } else { spin_unlock(&fs_info->trans_lock); } btrfs_cleanup_one_transaction(t, fs_info); spin_lock(&fs_info->trans_lock); if (t == fs_info->running_transaction) fs_info->running_transaction = NULL; list_del_init(&t->list); spin_unlock(&fs_info->trans_lock); btrfs_put_transaction(t); trace_btrfs_transaction_commit(fs_info->tree_root); spin_lock(&fs_info->trans_lock); } spin_unlock(&fs_info->trans_lock); btrfs_destroy_all_ordered_extents(fs_info); btrfs_destroy_delayed_inodes(fs_info); btrfs_assert_delayed_root_empty(fs_info); btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents); btrfs_destroy_all_delalloc_inodes(fs_info); mutex_unlock(&fs_info->transaction_kthread_mutex); return 0; } static const struct extent_io_ops btree_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btree_submit_bio_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, .readpage_io_failed_hook = btree_io_failed_hook, /* optional callbacks */ }; |
12 14 14 12 5 14 14 14 16 77 76 77 74 5 77 11 7 4 11 7 7 5 5 11 72 73 72 72 72 72 6 72 72 72 16 15 12 11 10 10 10 10 7 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 | /* * linux/drivers/video/fbcmap.c -- Colormap handling for frame buffer devices * * Created 15 Jun 1997 by Geert Uytterhoeven * * 2001 - Documented with DocBook * - Brad Douglas <brad@neruo.com> * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive for * more details. */ #include <linux/string.h> #include <linux/module.h> #include <linux/fb.h> #include <linux/slab.h> #include <linux/uaccess.h> static u16 red2[] __read_mostly = { 0x0000, 0xaaaa }; static u16 green2[] __read_mostly = { 0x0000, 0xaaaa }; static u16 blue2[] __read_mostly = { 0x0000, 0xaaaa }; static u16 red4[] __read_mostly = { 0x0000, 0xaaaa, 0x5555, 0xffff }; static u16 green4[] __read_mostly = { 0x0000, 0xaaaa, 0x5555, 0xffff }; static u16 blue4[] __read_mostly = { 0x0000, 0xaaaa, 0x5555, 0xffff }; static u16 red8[] __read_mostly = { 0x0000, 0x0000, 0x0000, 0x0000, 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa }; static u16 green8[] __read_mostly = { 0x0000, 0x0000, 0xaaaa, 0xaaaa, 0x0000, 0x0000, 0x5555, 0xaaaa }; static u16 blue8[] __read_mostly = { 0x0000, 0xaaaa, 0x0000, 0xaaaa, 0x0000, 0xaaaa, 0x0000, 0xaaaa }; static u16 red16[] __read_mostly = { 0x0000, 0x0000, 0x0000, 0x0000, 0xaaaa, 0xaaaa, 0xaaaa, 0xaaaa, 0x5555, 0x5555, 0x5555, 0x5555, 0xffff, 0xffff, 0xffff, 0xffff }; static u16 green16[] __read_mostly = { 0x0000, 0x0000, 0xaaaa, 0xaaaa, 0x0000, 0x0000, 0x5555, 0xaaaa, 0x5555, 0x5555, 0xffff, 0xffff, 0x5555, 0x5555, 0xffff, 0xffff }; static u16 blue16[] __read_mostly = { 0x0000, 0xaaaa, 0x0000, 0xaaaa, 0x0000, 0xaaaa, 0x0000, 0xaaaa, 0x5555, 0xffff, 0x5555, 0xffff, 0x5555, 0xffff, 0x5555, 0xffff }; static const struct fb_cmap default_2_colors = { .len=2, .red=red2, .green=green2, .blue=blue2 }; static const struct fb_cmap default_8_colors = { .len=8, .red=red8, .green=green8, .blue=blue8 }; static const struct fb_cmap default_4_colors = { .len=4, .red=red4, .green=green4, .blue=blue4 }; static const struct fb_cmap default_16_colors = { .len=16, .red=red16, .green=green16, .blue=blue16 }; /** * fb_alloc_cmap - allocate a colormap * @cmap: frame buffer colormap structure * @len: length of @cmap * @transp: boolean, 1 if there is transparency, 0 otherwise * @flags: flags for kmalloc memory allocation * * Allocates memory for a colormap @cmap. @len is the * number of entries in the palette. * * Returns negative errno on error, or zero on success. * */ int fb_alloc_cmap_gfp(struct fb_cmap *cmap, int len, int transp, gfp_t flags) { int size = len * sizeof(u16); int ret = -ENOMEM; flags |= __GFP_NOWARN; if (cmap->len != len) { fb_dealloc_cmap(cmap); if (!len) return 0; cmap->red = kzalloc(size, flags); if (!cmap->red) goto fail; cmap->green = kzalloc(size, flags); if (!cmap->green) goto fail; cmap->blue = kzalloc(size, flags); if (!cmap->blue) goto fail; if (transp) { cmap->transp = kzalloc(size, flags); if (!cmap->transp) goto fail; } else { cmap->transp = NULL; } } cmap->start = 0; cmap->len = len; ret = fb_copy_cmap(fb_default_cmap(len), cmap); if (ret) goto fail; return 0; fail: fb_dealloc_cmap(cmap); return ret; } int fb_alloc_cmap(struct fb_cmap *cmap, int len, int transp) { return fb_alloc_cmap_gfp(cmap, len, transp, GFP_ATOMIC); } /** * fb_dealloc_cmap - deallocate a colormap * @cmap: frame buffer colormap structure * * Deallocates a colormap that was previously allocated with * fb_alloc_cmap(). * */ void fb_dealloc_cmap(struct fb_cmap *cmap) { kfree(cmap->red); kfree(cmap->green); kfree(cmap->blue); kfree(cmap->transp); cmap->red = cmap->green = cmap->blue = cmap->transp = NULL; cmap->len = 0; } /** * fb_copy_cmap - copy a colormap * @from: frame buffer colormap structure * @to: frame buffer colormap structure * * Copy contents of colormap from @from to @to. */ int fb_copy_cmap(const struct fb_cmap *from, struct fb_cmap *to) { unsigned int tooff = 0, fromoff = 0; size_t size; if (to->start > from->start) fromoff = to->start - from->start; else tooff = from->start - to->start; if (fromoff >= from->len || tooff >= to->len) return -EINVAL; size = min_t(size_t, to->len - tooff, from->len - fromoff); if (size == 0) return -EINVAL; size *= sizeof(u16); memcpy(to->red+tooff, from->red+fromoff, size); memcpy(to->green+tooff, from->green+fromoff, size); memcpy(to->blue+tooff, from->blue+fromoff, size); if (from->transp && to->transp) memcpy(to->transp+tooff, from->transp+fromoff, size); return 0; } int fb_cmap_to_user(const struct fb_cmap *from, struct fb_cmap_user *to) { unsigned int tooff = 0, fromoff = 0; size_t size; if (to->start > from->start) fromoff = to->start - from->start; else tooff = from->start - to->start; if (fromoff >= from->len || tooff >= to->len) return -EINVAL; size = min_t(size_t, to->len - tooff, from->len - fromoff); if (size == 0) return -EINVAL; size *= sizeof(u16); if (copy_to_user(to->red+tooff, from->red+fromoff, size)) return -EFAULT; if (copy_to_user(to->green+tooff, from->green+fromoff, size)) return -EFAULT; if (copy_to_user(to->blue+tooff, from->blue+fromoff, size)) return -EFAULT; if (from->transp && to->transp) if (copy_to_user(to->transp+tooff, from->transp+fromoff, size)) return -EFAULT; return 0; } /** * fb_set_cmap - set the colormap * @cmap: frame buffer colormap structure * @info: frame buffer info structure * * Sets the colormap @cmap for a screen of device @info. * * Returns negative errno on error, or zero on success. * */ int fb_set_cmap(struct fb_cmap *cmap, struct fb_info *info) { int i, start, rc = 0; u16 *red, *green, *blue, *transp; u_int hred, hgreen, hblue, htransp = 0xffff; red = cmap->red; green = cmap->green; blue = cmap->blue; transp = cmap->transp; start = cmap->start; if (start < 0 || (!info->fbops->fb_setcolreg && !info->fbops->fb_setcmap)) return -EINVAL; if (info->fbops->fb_setcmap) { rc = info->fbops->fb_setcmap(cmap, info); } else { for (i = 0; i < cmap->len; i++) { hred = *red++; hgreen = *green++; hblue = *blue++; if (transp) htransp = *transp++; if (info->fbops->fb_setcolreg(start++, hred, hgreen, hblue, htransp, info)) break; } } if (rc == 0) fb_copy_cmap(cmap, &info->cmap); return rc; } int fb_set_user_cmap(struct fb_cmap_user *cmap, struct fb_info *info) { int rc, size = cmap->len * sizeof(u16); struct fb_cmap umap; if (size < 0 || size < cmap->len) return -E2BIG; memset(&umap, 0, sizeof(struct fb_cmap)); rc = fb_alloc_cmap_gfp(&umap, cmap->len, cmap->transp != NULL, GFP_KERNEL); if (rc) return rc; if (copy_from_user(umap.red, cmap->red, size) || copy_from_user(umap.green, cmap->green, size) || copy_from_user(umap.blue, cmap->blue, size) || (cmap->transp && copy_from_user(umap.transp, cmap->transp, size))) { rc = -EFAULT; goto out; } umap.start = cmap->start; if (!lock_fb_info(info)) { rc = -ENODEV; goto out; } rc = fb_set_cmap(&umap, info); unlock_fb_info(info); out: fb_dealloc_cmap(&umap); return rc; } /** * fb_default_cmap - get default colormap * @len: size of palette for a depth * * Gets the default colormap for a specific screen depth. @len * is the size of the palette for a particular screen depth. * * Returns pointer to a frame buffer colormap structure. * */ const struct fb_cmap *fb_default_cmap(int len) { if (len <= 2) return &default_2_colors; if (len <= 4) return &default_4_colors; if (len <= 8) return &default_8_colors; return &default_16_colors; } /** * fb_invert_cmaps - invert all defaults colormaps * * Invert all default colormaps. * */ void fb_invert_cmaps(void) { u_int i; for (i = 0; i < ARRAY_SIZE(red2); i++) { red2[i] = ~red2[i]; green2[i] = ~green2[i]; blue2[i] = ~blue2[i]; } for (i = 0; i < ARRAY_SIZE(red4); i++) { red4[i] = ~red4[i]; green4[i] = ~green4[i]; blue4[i] = ~blue4[i]; } for (i = 0; i < ARRAY_SIZE(red8); i++) { red8[i] = ~red8[i]; green8[i] = ~green8[i]; blue8[i] = ~blue8[i]; } for (i = 0; i < ARRAY_SIZE(red16); i++) { red16[i] = ~red16[i]; green16[i] = ~green16[i]; blue16[i] = ~blue16[i]; } } /* * Visible symbols for modules */ EXPORT_SYMBOL(fb_alloc_cmap); EXPORT_SYMBOL(fb_dealloc_cmap); EXPORT_SYMBOL(fb_copy_cmap); EXPORT_SYMBOL(fb_set_cmap); EXPORT_SYMBOL(fb_default_cmap); EXPORT_SYMBOL(fb_invert_cmaps); |
3 258 1 200 90 137 77 195 184 104 28 1 29 163 38 7 31 9 185 225 121 227 235 236 240 38 13 229 184 240 184 240 179 141 111 8 141 112 10 97 187 187 228 187 187 28 187 186 187 187 187 26 206 38 187 186 223 23 38 154 105 187 138 24 180 24 184 93 92 132 36 35 21 1 4 29 126 1 1 1 81 17 131 189 147 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. */ #ifndef BTRFS_CTREE_H #define BTRFS_CTREE_H #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/highmem.h> #include <linux/fs.h> #include <linux/rwsem.h> #include <linux/semaphore.h> #include <linux/completion.h> #include <linux/backing-dev.h> #include <linux/wait.h> #include <linux/slab.h> #include <linux/kobject.h> #include <trace/events/btrfs.h> #include <asm/kmap_types.h> #include <linux/pagemap.h> #include <linux/btrfs.h> #include <linux/btrfs_tree.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/sizes.h> #include <linux/dynamic_debug.h> #include <linux/refcount.h> #include <linux/crc32c.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" struct btrfs_trans_handle; struct btrfs_transaction; struct btrfs_pending_snapshot; extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS #define STATIC noinline #else #define STATIC static noinline #endif #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ #define BTRFS_MAX_MIRRORS 3 #define BTRFS_MAX_LEVEL 8 #define BTRFS_OLDEST_GENERATION 0ULL /* * the max metadata block size. This limit is somewhat artificial, * but the memmove costs go through the roof for larger blocks. */ #define BTRFS_MAX_METADATA_BLOCKSIZE 65536 /* * we can actually store much bigger names, but lets not confuse the rest * of linux */ #define BTRFS_NAME_LEN 255 /* * Theoretical limit is larger, but we keep this down to a sane * value. That should limit greatly the possibility of collisions on * inode ref items. */ #define BTRFS_LINK_MAX 65535U /* four bytes for CRC32 */ static const int btrfs_csum_sizes[] = { 4 }; #define BTRFS_EMPTY_DIR_SIZE 0 /* ioprio of readahead is set to idle */ #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) #define BTRFS_DIRTY_METADATA_THRESH SZ_32M /* * Use large batch size to reduce overhead of metadata updates. On the reader * side, we only read it when we are close to ENOSPC and the read overhead is * mostly related to the number of CPUs, so it is OK to use arbitrary large * value here. */ #define BTRFS_TOTAL_BYTES_PINNED_BATCH SZ_128M #define BTRFS_MAX_EXTENT_SIZE SZ_128M /* * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size */ static inline u32 count_max_extents(u64 size) { return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); } struct btrfs_mapping_tree { struct extent_map_tree map_tree; }; static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); return sizeof(struct btrfs_chunk) + sizeof(struct btrfs_stripe) * (num_stripes - 1); } /* * File system states */ #define BTRFS_FS_STATE_ERROR 0 #define BTRFS_FS_STATE_REMOUNTING 1 #define BTRFS_FS_STATE_TRANS_ABORTED 2 #define BTRFS_FS_STATE_DEV_REPLACING 3 #define BTRFS_FS_STATE_DUMMY_FS_INFO 4 #define BTRFS_BACKREF_REV_MAX 256 #define BTRFS_BACKREF_REV_SHIFT 56 #define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ BTRFS_BACKREF_REV_SHIFT) #define BTRFS_OLD_BACKREF_REV 0 #define BTRFS_MIXED_BACKREF_REV 1 /* * every tree block (leaf or node) starts with this header. */ struct btrfs_header { /* these first four must match the super block */ u8 csum[BTRFS_CSUM_SIZE]; u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ __le64 bytenr; /* which block this node is supposed to live in */ __le64 flags; /* allowed to be different from the super from here on down */ u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; __le64 generation; __le64 owner; __le32 nritems; u8 level; } __attribute__ ((__packed__)); /* * this is a very generous portion of the super block, giving us * room to translate 14 chunks with 3 stripes each. */ #define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 /* * just in case we somehow lose the roots and are not able to mount, * we store an array of the roots from previous transactions * in the super. */ #define BTRFS_NUM_BACKUP_ROOTS 4 struct btrfs_root_backup { __le64 tree_root; __le64 tree_root_gen; __le64 chunk_root; __le64 chunk_root_gen; __le64 extent_root; __le64 extent_root_gen; __le64 fs_root; __le64 fs_root_gen; __le64 dev_root; __le64 dev_root_gen; __le64 csum_root; __le64 csum_root_gen; __le64 total_bytes; __le64 bytes_used; __le64 num_devices; /* future */ __le64 unused_64[4]; u8 tree_root_level; u8 chunk_root_level; u8 extent_root_level; u8 fs_root_level; u8 dev_root_level; u8 csum_root_level; /* future and to align */ u8 unused_8[10]; } __attribute__ ((__packed__)); /* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc */ struct btrfs_super_block { u8 csum[BTRFS_CSUM_SIZE]; /* the first 4 fields must match struct btrfs_header */ u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ __le64 bytenr; /* this block number */ __le64 flags; /* allowed to be different from the btrfs_header from here own down */ __le64 magic; __le64 generation; __le64 root; __le64 chunk_root; __le64 log_root; /* this will help find the new super based on the log root */ __le64 log_root_transid; __le64 total_bytes; __le64 bytes_used; __le64 root_dir_objectid; __le64 num_devices; __le32 sectorsize; __le32 nodesize; __le32 __unused_leafsize; __le32 stripesize; __le32 sys_chunk_array_size; __le64 chunk_root_generation; __le64 compat_flags; __le64 compat_ro_flags; __le64 incompat_flags; __le16 csum_type; u8 root_level; u8 chunk_root_level; u8 log_root_level; struct btrfs_dev_item dev_item; char label[BTRFS_LABEL_SIZE]; __le64 cache_generation; __le64 uuid_tree_generation; /* future expansion */ __le64 reserved[30]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; } __attribute__ ((__packed__)); /* * Compat flags that we support. If any incompat flags are set other than the * ones specified below then we will fail to mount */ #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP \ (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID) #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ BTRFS_FEATURE_INCOMPAT_RAID56 | \ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ BTRFS_FEATURE_INCOMPAT_NO_HOLES) #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) #define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL /* * A leaf is full of items. offset and size tell us where to find * the item in the leaf (relative to the start of the data area) */ struct btrfs_item { struct btrfs_disk_key key; __le32 offset; __le32 size; } __attribute__ ((__packed__)); /* * leaves have an item area and a data area: * [item0, item1....itemN] [free space] [dataN...data1, data0] * * The data is separate from the items to get the keys closer together * during searches. */ struct btrfs_leaf { struct btrfs_header header; struct btrfs_item items[]; } __attribute__ ((__packed__)); /* * all non-leaf blocks are nodes, they hold only keys and pointers to * other blocks */ struct btrfs_key_ptr { struct btrfs_disk_key key; __le64 blockptr; __le64 generation; } __attribute__ ((__packed__)); struct btrfs_node { struct btrfs_header header; struct btrfs_key_ptr ptrs[]; } __attribute__ ((__packed__)); /* * btrfs_paths remember the path taken from the root down to the leaf. * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point * to any other levels that are present. * * The slots array records the index of the item or block pointer * used while walking the tree. */ enum { READA_NONE = 0, READA_BACK, READA_FORWARD }; struct btrfs_path { struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; int slots[BTRFS_MAX_LEVEL]; /* if there is real range locking, this locks field will change */ u8 locks[BTRFS_MAX_LEVEL]; u8 reada; /* keep some upper locks as we walk down */ u8 lowest_level; /* * set by btrfs_split_item, tells search_slot to keep all locks * and to force calls to keep space in the nodes */ unsigned int search_for_split:1; unsigned int keep_locks:1; unsigned int skip_locking:1; unsigned int leave_spinning:1; unsigned int search_commit_root:1; unsigned int need_commit_sem:1; unsigned int skip_release_on_error:1; }; #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ sizeof(struct btrfs_item)) struct btrfs_dev_replace { u64 replace_state; /* see #define above */ time64_t time_started; /* seconds since 1-Jan-1970 */ time64_t time_stopped; /* seconds since 1-Jan-1970 */ atomic64_t num_write_errors; atomic64_t num_uncorrectable_read_errors; u64 cursor_left; u64 committed_cursor_left; u64 cursor_left_last_write_of_item; u64 cursor_right; u64 cont_reading_from_srcdev_mode; /* see #define above */ int is_valid; int item_needs_writeback; struct btrfs_device *srcdev; struct btrfs_device *tgtdev; struct mutex lock_finishing_cancel_unmount; rwlock_t lock; atomic_t read_locks; atomic_t blocking_readers; wait_queue_head_t read_lock_wq; struct btrfs_scrub_progress scrub_progress; }; /* For raid type sysfs entries */ struct raid_kobject { u64 flags; struct kobject kobj; struct list_head list; }; struct btrfs_space_info { spinlock_t lock; u64 total_bytes; /* total bytes in the space, this doesn't take mirrors into account */ u64 bytes_used; /* total bytes used, this doesn't take mirrors into account */ u64 bytes_pinned; /* total bytes pinned, will be freed when the transaction finishes */ u64 bytes_reserved; /* total bytes the allocator has reserved for current allocations */ u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ u64 bytes_readonly; /* total bytes that are read only */ u64 max_extent_size; /* This will hold the maximum extent size of the space info if we had an ENOSPC in the allocator. */ unsigned int full:1; /* indicates that we cannot allocate any more chunks for this space */ unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ unsigned int flush:1; /* set if we are trying to make space */ unsigned int force_alloc; /* set if we need to force a chunk alloc for this space */ u64 disk_used; /* total bytes used on disk */ u64 disk_total; /* total bytes on disk, takes mirrors into account */ u64 flags; /* * bytes_pinned is kept in line with what is actually pinned, as in * we've called update_block_group and dropped the bytes_used counter * and increased the bytes_pinned counter. However this means that * bytes_pinned does not reflect the bytes that will be pinned once the * delayed refs are flushed, so this counter is inc'ed every time we * call btrfs_free_extent so it is a realtime count of what will be * freed once the transaction is committed. It will be zeroed every * time the transaction commits. */ struct percpu_counter total_bytes_pinned; struct list_head list; /* Protected by the spinlock 'lock'. */ struct list_head ro_bgs; struct list_head priority_tickets; struct list_head tickets; /* * tickets_id just indicates the next ticket will be handled, so note * it's not stored per ticket. */ u64 tickets_id; struct rw_semaphore groups_sem; /* for block groups in our same type */ struct list_head block_groups[BTRFS_NR_RAID_TYPES]; wait_queue_head_t wait; struct kobject kobj; struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; }; #define BTRFS_BLOCK_RSV_GLOBAL 1 #define BTRFS_BLOCK_RSV_DELALLOC 2 #define BTRFS_BLOCK_RSV_TRANS 3 #define BTRFS_BLOCK_RSV_CHUNK 4 #define BTRFS_BLOCK_RSV_DELOPS 5 #define BTRFS_BLOCK_RSV_EMPTY 6 #define BTRFS_BLOCK_RSV_TEMP 7 struct btrfs_block_rsv { u64 size; u64 reserved; struct btrfs_space_info *space_info; spinlock_t lock; unsigned short full; unsigned short type; unsigned short failfast; /* * Qgroup equivalent for @size @reserved * * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care * about things like csum size nor how many tree blocks it will need to * reserve. * * Qgroup cares more about net change of the extent usage. * * So for one newly inserted file extent, in worst case it will cause * leaf split and level increase, nodesize for each file extent is * already too much. * * In short, qgroup_size/reserved is the upper limit of possible needed * qgroup metadata reservation. */ u64 qgroup_rsv_size; u64 qgroup_rsv_reserved; }; /* * free clusters are used to claim free space in relatively large chunks, * allowing us to do less seeky writes. They are used for all metadata * allocations. In ssd_spread mode they are also used for data allocations. */ struct btrfs_free_cluster { spinlock_t lock; spinlock_t refill_lock; struct rb_root root; /* largest extent in this cluster */ u64 max_size; /* first extent starting offset */ u64 window_start; /* We did a full search and couldn't create a cluster */ bool fragmented; struct btrfs_block_group_cache *block_group; /* * when a cluster is allocated from a block group, we put the * cluster onto a list in the block group so that it can * be freed before the block group is freed. */ struct list_head block_group_list; }; enum btrfs_caching_type { BTRFS_CACHE_NO = 0, BTRFS_CACHE_STARTED = 1, BTRFS_CACHE_FAST = 2, BTRFS_CACHE_FINISHED = 3, BTRFS_CACHE_ERROR = 4, }; enum btrfs_disk_cache_state { BTRFS_DC_WRITTEN = 0, BTRFS_DC_ERROR = 1, BTRFS_DC_CLEAR = 2, BTRFS_DC_SETUP = 3, }; struct btrfs_caching_control { struct list_head list; struct mutex mutex; wait_queue_head_t wait; struct btrfs_work work; struct btrfs_block_group_cache *block_group; u64 progress; refcount_t count; }; /* Once caching_thread() finds this much free space, it will wake up waiters. */ #define CACHING_CTL_WAKE_UP SZ_2M struct btrfs_io_ctl { void *cur, *orig; struct page *page; struct page **pages; struct btrfs_fs_info *fs_info; struct inode *inode; unsigned long size; int index; int num_pages; int entries; int bitmaps; unsigned check_crcs:1; }; /* * Tree to record all locked full stripes of a RAID5/6 block group */ struct btrfs_full_stripe_locks_tree { struct rb_root root; struct mutex lock; }; struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; struct btrfs_fs_info *fs_info; struct inode *inode; spinlock_t lock; u64 pinned; u64 reserved; u64 delalloc_bytes; u64 bytes_super; u64 flags; u64 cache_generation; /* * If the free space extent count exceeds this number, convert the block * group to bitmaps. */ u32 bitmap_high_thresh; /* * If the free space extent count drops below this number, convert the * block group back to extents. */ u32 bitmap_low_thresh; /* * It is just used for the delayed data space allocation because * only the data space allocation and the relative metadata update * can be done cross the transaction. */ struct rw_semaphore data_rwsem; /* for raid56, this is a full stripe, without parity */ unsigned long full_stripe_len; unsigned int ro; unsigned int iref:1; unsigned int has_caching_ctl:1; unsigned int removed:1; int disk_cache_state; /* cache tracking stuff */ int cached; struct btrfs_caching_control *caching_ctl; u64 last_byte_to_unpin; struct btrfs_space_info *space_info; /* free space cache stuff */ struct btrfs_free_space_ctl *free_space_ctl; /* block group cache stuff */ struct rb_node cache_node; /* for block groups in the same raid type */ struct list_head list; /* usage count */ atomic_t count; /* List of struct btrfs_free_clusters for this block group. * Today it will only have one thing on it, but that may change */ struct list_head cluster_list; /* For delayed block group creation or deletion of empty block groups */ struct list_head bg_list; /* For read-only block groups */ struct list_head ro_list; atomic_t trimming; /* For dirty block groups */ struct list_head dirty_list; struct list_head io_list; struct btrfs_io_ctl io_ctl; /* * Incremented when doing extent allocations and holding a read lock * on the space_info's groups_sem semaphore. * Decremented when an ordered extent that represents an IO against this * block group's range is created (after it's added to its inode's * root's list of ordered extents) or immediately after the allocation * if it's a metadata extent or fallocate extent (for these cases we * don't create ordered extents). */ atomic_t reservations; /* * Incremented while holding the spinlock *lock* by a task checking if * it can perform a nocow write (incremented if the value for the *ro* * field is 0). Decremented by such tasks once they create an ordered * extent or before that if some error happens before reaching that step. * This is to prevent races between block group relocation and nocow * writes through direct IO. */ atomic_t nocow_writers; /* Lock for free space tree operations. */ struct mutex free_space_lock; /* * Does the block group need to be added to the free space tree? * Protected by free_space_lock. */ int needs_free_space; /* Record locked full stripes for RAID5/6 block group */ struct btrfs_full_stripe_locks_tree full_stripe_locks_root; }; /* delayed seq elem */ struct seq_list { struct list_head list; u64 seq; }; #define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 } #define SEQ_LAST ((u64)-1) enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_STARTED = 1, ORPHAN_CLEANUP_DONE = 2, }; /* used by the raid56 code to lock stripes for read/modify/write */ struct btrfs_stripe_hash { struct list_head hash_list; spinlock_t lock; }; /* used by the raid56 code to lock stripes for read/modify/write */ struct btrfs_stripe_hash_table { struct list_head stripe_cache; spinlock_t cache_lock; int cache_size; struct btrfs_stripe_hash table[]; }; #define BTRFS_STRIPE_HASH_TABLE_BITS 11 void btrfs_init_async_reclaim_work(struct work_struct *work); /* fs_info */ struct reloc_control; struct btrfs_device; struct btrfs_fs_devices; struct btrfs_balance_control; struct btrfs_delayed_root; #define BTRFS_FS_BARRIER 1 #define BTRFS_FS_CLOSING_START 2 #define BTRFS_FS_CLOSING_DONE 3 #define BTRFS_FS_LOG_RECOVERING 4 #define BTRFS_FS_OPEN 5 #define BTRFS_FS_QUOTA_ENABLED 6 #define BTRFS_FS_UPDATE_UUID_TREE_GEN 9 #define BTRFS_FS_CREATING_FREE_SPACE_TREE 10 #define BTRFS_FS_BTREE_ERR 11 #define BTRFS_FS_LOG1_ERR 12 #define BTRFS_FS_LOG2_ERR 13 #define BTRFS_FS_QUOTA_OVERRIDE 14 /* Used to record internally whether fs has been frozen */ #define BTRFS_FS_FROZEN 15 /* * Indicate that a whole-filesystem exclusive operation is running * (device replace, resize, device add/delete, balance) */ #define BTRFS_FS_EXCL_OP 16 /* * To info transaction_kthread we need an immediate commit so it doesn't * need to wait for commit_interval */ #define BTRFS_FS_NEED_ASYNC_COMMIT 17 /* * Indicate that balance has been set up from the ioctl and is in the main * phase. The fs_info::balance_ctl is initialized. */ #define BTRFS_FS_BALANCE_RUNNING 18 struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; struct btrfs_root *extent_root; struct btrfs_root *tree_root; struct btrfs_root *chunk_root; struct btrfs_root *dev_root; struct btrfs_root *fs_root; struct btrfs_root *csum_root; struct btrfs_root *quota_root; struct btrfs_root *uuid_root; struct btrfs_root *free_space_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; spinlock_t fs_roots_radix_lock; struct radix_tree_root fs_roots_radix; /* block group cache stuff */ spinlock_t block_group_cache_lock; u64 first_logical_byte; struct rb_root block_group_cache_tree; /* keep track of unallocated space */ atomic64_t free_chunk_space; struct extent_io_tree freed_extents[2]; struct extent_io_tree *pinned_extents; /* logical->physical extent mapping */ struct btrfs_mapping_tree mapping_tree; /* * block reservation for extent, checksum, root tree and * delayed dir index item */ struct btrfs_block_rsv global_block_rsv; /* block reservation for metadata operations */ struct btrfs_block_rsv trans_block_rsv; /* block reservation for chunk tree */ struct btrfs_block_rsv chunk_block_rsv; /* block reservation for delayed operations */ struct btrfs_block_rsv delayed_block_rsv; struct btrfs_block_rsv empty_block_rsv; u64 generation; u64 last_trans_committed; u64 avg_delayed_ref_runtime; /* * this is updated to the current trans every time a full commit * is required instead of the faster short fsync log commits */ u64 last_trans_log_full_commit; unsigned long mount_opt; /* * Track requests for actions that need to be done during transaction * commit (like for some mount options). */ unsigned long pending_changes; unsigned long compress_type:4; unsigned int compress_level; u32 commit_interval; /* * It is a suggestive number, the read side is safe even it gets a * wrong number because we will write out the data into a regular * extent. The write side(mount/remount) is under ->s_umount lock, * so it is also safe. */ u64 max_inline; struct btrfs_transaction *running_transaction; wait_queue_head_t transaction_throttle; wait_queue_head_t transaction_wait; wait_queue_head_t transaction_blocked_wait; wait_queue_head_t async_submit_wait; /* * Used to protect the incompat_flags, compat_flags, compat_ro_flags * when they are updated. * * Because we do not clear the flags for ever, so we needn't use * the lock on the read side. * * We also needn't use the lock when we mount the fs, because * there is no other task which will update the flag. */ spinlock_t super_lock; struct btrfs_super_block *super_copy; struct btrfs_super_block *super_for_commit; struct super_block *sb; struct inode *btree_inode; struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; struct mutex chunk_mutex; /* * this is taken to make sure we don't set block groups ro after * the free space cache has been allocated on them */ struct mutex ro_block_group_mutex; /* this is used during read/modify/write to make sure * no two ios are trying to mod the same stripe at the same * time */ struct btrfs_stripe_hash_table *stripe_hash_table; /* * this protects the ordered operations list only while we are * processing all of the entries on it. This way we make * sure the commit code doesn't find the list temporarily empty * because another function happens to be doing non-waiting preflush * before jumping into the main commit. */ struct mutex ordered_operations_mutex; struct rw_semaphore commit_root_sem; struct rw_semaphore cleanup_work_sem; struct rw_semaphore subvol_sem; struct srcu_struct subvol_srcu; spinlock_t trans_lock; /* * the reloc mutex goes with the trans lock, it is taken * during commit to protect us from the relocation code */ struct mutex reloc_mutex; struct list_head trans_list; struct list_head dead_roots; struct list_head caching_block_groups; spinlock_t delayed_iput_lock; struct list_head delayed_iputs; struct mutex cleaner_delayed_iput_mutex; atomic64_t tree_mod_seq; /* this protects tree_mod_log and tree_mod_seq_list */ rwlock_t tree_mod_log_lock; struct rb_root tree_mod_log; struct list_head tree_mod_seq_list; atomic_t async_delalloc_pages; /* * this is used to protect the following list -- ordered_roots. */ spinlock_t ordered_root_lock; /* * all fs/file tree roots in which there are data=ordered extents * pending writeback are added into this list. * * these can span multiple transactions and basically include * every dirty data page that isn't from nodatacow */ struct list_head ordered_roots; struct mutex delalloc_root_mutex; spinlock_t delalloc_root_lock; /* all fs/file tree roots that have delalloc inodes. */ struct list_head delalloc_roots; /* * there is a pool of worker threads for checksumming during writes * and a pool for checksumming after reads. This is because readers * can run with FS locks held, and the writers may be waiting for * those locks. We don't want ordering in the pending list to cause * deadlocks, and so the two are serviced separately. * * A third pool does submit_bio to avoid deadlocking with the other * two */ struct btrfs_workqueue *workers; struct btrfs_workqueue *delalloc_workers; struct btrfs_workqueue *flush_workers; struct btrfs_workqueue *endio_workers; struct btrfs_workqueue *endio_meta_workers; struct btrfs_workqueue *endio_raid56_workers; struct btrfs_workqueue *endio_repair_workers; struct btrfs_workqueue *rmw_workers; struct btrfs_workqueue *endio_meta_write_workers; struct btrfs_workqueue *endio_write_workers; struct btrfs_workqueue *endio_freespace_worker; struct btrfs_workqueue *submit_workers; struct btrfs_workqueue *caching_workers; struct btrfs_workqueue *readahead_workers; /* * fixup workers take dirty pages that didn't properly go through * the cow mechanism and make them safe to write. It happens * for the sys_munmap function call path */ struct btrfs_workqueue *fixup_workers; struct btrfs_workqueue *delayed_workers; /* the extent workers do delayed refs on the extent allocation tree */ struct btrfs_workqueue *extent_workers; struct task_struct *transaction_kthread; struct task_struct *cleaner_kthread; u32 thread_pool_size; struct kobject *space_info_kobj; struct list_head pending_raid_kobjs; spinlock_t pending_raid_kobjs_lock; /* uncontended */ u64 total_pinned; /* used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; struct percpu_counter delalloc_bytes; s32 dirty_metadata_batch; s32 delalloc_batch; struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; /* * The space_info list is effectively read only after initial * setup. It is populated at mount time and cleaned up after * all block groups are removed. RCU is used to protect it. */ struct list_head space_info; struct btrfs_space_info *data_sinfo; struct reloc_control *reloc_ctl; /* data_alloc_cluster is only used in ssd_spread mode */ struct btrfs_free_cluster data_alloc_cluster; /* all metadata allocations go through this cluster */ struct btrfs_free_cluster meta_alloc_cluster; /* auto defrag inodes go here */ spinlock_t defrag_inodes_lock; struct rb_root defrag_inodes; atomic_t defrag_running; /* Used to protect avail_{data, metadata, system}_alloc_bits */ seqlock_t profiles_lock; /* * these three are in extended format (availability of single * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits) */ u64 avail_data_alloc_bits; u64 avail_metadata_alloc_bits; u64 avail_system_alloc_bits; /* restriper state */ spinlock_t balance_lock; struct mutex balance_mutex; atomic_t balance_pause_req; atomic_t balance_cancel_req; struct btrfs_balance_control *balance_ctl; wait_queue_head_t balance_wait_q; u32 data_chunk_allocations; u32 metadata_ratio; void *bdev_holder; /* private scrub information */ struct mutex scrub_lock; atomic_t scrubs_running; atomic_t scrub_pause_req; atomic_t scrubs_paused; atomic_t scrub_cancel_req; wait_queue_head_t scrub_pause_wait; int scrub_workers_refcnt; struct btrfs_workqueue *scrub_workers; struct btrfs_workqueue *scrub_wr_completion_workers; struct btrfs_workqueue *scrub_nocow_workers; struct btrfs_workqueue *scrub_parity_workers; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; #endif /* is qgroup tracking in a consistent state? */ u64 qgroup_flags; /* holds configuration and tracking. Protected by qgroup_lock */ struct rb_root qgroup_tree; struct rb_root qgroup_op_tree; spinlock_t qgroup_lock; spinlock_t qgroup_op_lock; atomic_t qgroup_op_seq; /* * used to avoid frequently calling ulist_alloc()/ulist_free() * when doing qgroup accounting, it must be protected by qgroup_lock. */ struct ulist *qgroup_ulist; /* protect user change for quota operations */ struct mutex qgroup_ioctl_lock; /* list of dirty qgroups to be written at next commit */ struct list_head dirty_qgroups; /* used by qgroup for an efficient tree traversal */ u64 qgroup_seq; /* qgroup rescan items */ struct mutex qgroup_rescan_lock; /* protects the progress item */ struct btrfs_key qgroup_rescan_progress; struct btrfs_workqueue *qgroup_rescan_workers; struct completion qgroup_rescan_completion; struct btrfs_work qgroup_rescan_work; bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */ /* filesystem state */ unsigned long fs_state; struct btrfs_delayed_root *delayed_root; /* readahead tree */ spinlock_t reada_lock; struct radix_tree_root reada_tree; /* readahead works cnt */ atomic_t reada_works_cnt; /* Extent buffer radix tree */ spinlock_t buffer_lock; struct radix_tree_root buffer_radix; /* next backup root to be overwritten */ int backup_root_index; /* device replace state */ struct btrfs_dev_replace dev_replace; struct percpu_counter bio_counter; wait_queue_head_t replace_wait; struct semaphore uuid_tree_rescan_sem; /* Used to reclaim the metadata space in the background. */ struct work_struct async_reclaim_work; spinlock_t unused_bgs_lock; struct list_head unused_bgs; struct mutex unused_bg_unpin_mutex; struct mutex delete_unused_bgs_mutex; /* For btrfs to record security options */ struct security_mnt_opts security_opts; /* * Chunks that can't be freed yet (under a trim/discard operation) * and will be latter freed. Protected by fs_info->chunk_mutex. */ struct list_head pinned_chunks; /* Cached block sizes */ u32 nodesize; u32 sectorsize; u32 stripesize; #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; #endif }; static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; } struct btrfs_subvolume_writers { struct percpu_counter counter; wait_queue_head_t wait; }; /* * The state of btrfs root */ /* * btrfs_record_root_in_trans is a multi-step process, * and it can race with the balancing code. But the * race is very small, and only the first time the root * is added to each transaction. So IN_TRANS_SETUP * is used to tell us when more checks are required */ #define BTRFS_ROOT_IN_TRANS_SETUP 0 #define BTRFS_ROOT_REF_COWS 1 #define BTRFS_ROOT_TRACK_DIRTY 2 #define BTRFS_ROOT_IN_RADIX 3 #define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 4 #define BTRFS_ROOT_DEFRAG_RUNNING 5 #define BTRFS_ROOT_FORCE_COW 6 #define BTRFS_ROOT_MULTI_LOG_TASKS 7 #define BTRFS_ROOT_DIRTY 8 /* * in ram representation of the tree. extent_root is used for all allocations * and for the extent tree extent_root root. */ struct btrfs_root { struct extent_buffer *node; struct extent_buffer *commit_root; struct btrfs_root *log_root; struct btrfs_root *reloc_root; unsigned long state; struct btrfs_root_item root_item; struct btrfs_key root_key; struct btrfs_fs_info *fs_info; struct extent_io_tree dirty_log_pages; struct mutex objectid_mutex; spinlock_t accounting_lock; struct btrfs_block_rsv *block_rsv; /* free ino cache stuff */ struct btrfs_free_space_ctl *free_ino_ctl; enum btrfs_caching_type ino_cache_state; spinlock_t ino_cache_lock; wait_queue_head_t ino_cache_wait; struct btrfs_free_space_ctl *free_ino_pinned; u64 ino_cache_progress; struct inode *ino_cache_inode; struct mutex log_mutex; wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; struct list_head log_ctxs[2]; atomic_t log_writers; atomic_t log_commit[2]; atomic_t log_batch; int log_transid; /* No matter the commit succeeds or not*/ int log_transid_committed; /* Just be updated when the commit succeeds. */ int last_log_commit; pid_t log_start_pid; u64 objectid; u64 last_trans; u32 type; u64 highest_objectid; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */ u64 alloc_bytenr; #endif u64 defrag_trans_start; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; /* the dirty list is only used by non-reference counted roots */ struct list_head dirty_list; struct list_head root_list; spinlock_t log_extents_lock[2]; struct list_head logged_list[2]; int orphan_cleanup_state; spinlock_t inode_lock; /* red-black tree that keeps track of in-memory inodes */ struct rb_root inode_tree; /* * radix tree that keeps track of delayed nodes of every inode, * protected by inode_lock */ struct radix_tree_root delayed_nodes_tree; /* * right now this just gets used so that a root has its own devid * for stat. It may be used for more later */ dev_t anon_dev; spinlock_t root_item_lock; refcount_t refs; struct mutex delalloc_mutex; spinlock_t delalloc_lock; /* * all of the inodes that have delalloc bytes. It is possible for * this list to be empty even when there is still dirty data=ordered * extents waiting to finish IO. */ struct list_head delalloc_inodes; struct list_head delalloc_root; u64 nr_delalloc_inodes; struct mutex ordered_extent_mutex; /* * this is used by the balancing code to wait for all the pending * ordered extents */ spinlock_t ordered_extent_lock; /* * all of the data=ordered extents pending writeback * these can span multiple transactions and basically include * every dirty data page that isn't from nodatacow */ struct list_head ordered_extents; struct list_head ordered_root; u64 nr_ordered_extents; /* * Number of currently running SEND ioctls to prevent * manipulation with the read-only status via SUBVOL_SETFLAGS */ int send_in_progress; struct btrfs_subvolume_writers *subv_writers; atomic_t will_be_snapshotted; atomic_t snapshot_force_cow; /* For qgroup metadata reserved space */ spinlock_t qgroup_meta_rsv_lock; u64 qgroup_meta_rsv_pertrans; u64 qgroup_meta_rsv_prealloc; }; struct btrfs_file_private { void *filldir_buf; }; static inline u32 btrfs_inode_sectorsize(const struct inode *inode) { return btrfs_sb(inode->i_sb)->sectorsize; } static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) { return info->nodesize - sizeof(struct btrfs_header); } #define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items) static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info) { return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item); } static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_fs_info *info) { return BTRFS_LEAF_DATA_SIZE(info) / sizeof(struct btrfs_key_ptr); } #define BTRFS_FILE_EXTENT_INLINE_DATA_START \ (offsetof(struct btrfs_file_extent_item, disk_bytenr)) static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info) { return BTRFS_MAX_ITEM_SIZE(info) - BTRFS_FILE_EXTENT_INLINE_DATA_START; } static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) { return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } /* * Flags for mount options. * * Note: don't forget to add new options to btrfs_show_options() */ #define BTRFS_MOUNT_NODATASUM (1 << 0) #define BTRFS_MOUNT_NODATACOW (1 << 1) #define BTRFS_MOUNT_NOBARRIER (1 << 2) #define BTRFS_MOUNT_SSD (1 << 3) #define BTRFS_MOUNT_DEGRADED (1 << 4) #define BTRFS_MOUNT_COMPRESS (1 << 5) #define BTRFS_MOUNT_NOTREELOG (1 << 6) #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) #define BTRFS_MOUNT_NOSSD (1 << 9) #define BTRFS_MOUNT_DISCARD (1 << 10) #define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) #define BTRFS_MOUNT_SPACE_CACHE (1 << 12) #define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) #define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) #define BTRFS_MOUNT_USEBACKUPROOT (1 << 18) #define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) #define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) #define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) #define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24) #define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) #define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) #define BTRFS_MOUNT_NOLOGREPLAY (1 << 27) #define BTRFS_MOUNT_REF_VERIFY (1 << 28) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) #define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) #define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ BTRFS_MOUNT_##opt) #define btrfs_set_and_info(fs_info, opt, fmt, args...) \ { \ if (!btrfs_test_opt(fs_info, opt)) \ btrfs_info(fs_info, fmt, ##args); \ btrfs_set_opt(fs_info->mount_opt, opt); \ } #define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ { \ if (btrfs_test_opt(fs_info, opt)) \ btrfs_info(fs_info, fmt, ##args); \ btrfs_clear_opt(fs_info->mount_opt, opt); \ } #ifdef CONFIG_BTRFS_DEBUG static inline int btrfs_should_fragment_free_space(struct btrfs_block_group_cache *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || (btrfs_test_opt(fs_info, FRAGMENT_DATA) && block_group->flags & BTRFS_BLOCK_GROUP_DATA); } #endif /* * Requests for changes that need to be done during transaction commit. * * Internal mount options that are used for special handling of the real * mount options (eg. cannot be set during remount and have to be set during * transaction commit) */ #define BTRFS_PENDING_SET_INODE_MAP_CACHE (0) #define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE (1) #define BTRFS_PENDING_COMMIT (2) #define btrfs_test_pending(info, opt) \ test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) #define btrfs_set_pending(info, opt) \ set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) #define btrfs_clear_pending(info, opt) \ clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) /* * Helpers for setting pending mount option changes. * * Expects corresponding macros * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name */ #define btrfs_set_pending_and_info(info, opt, fmt, args...) \ do { \ if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \ btrfs_info((info), fmt, ##args); \ btrfs_set_pending((info), SET_##opt); \ btrfs_clear_pending((info), CLEAR_##opt); \ } \ } while(0) #define btrfs_clear_pending_and_info(info, opt, fmt, args...) \ do { \ if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \ btrfs_info((info), fmt, ##args); \ btrfs_set_pending((info), CLEAR_##opt); \ btrfs_clear_pending((info), SET_##opt); \ } \ } while(0) /* * Inode flags */ #define BTRFS_INODE_NODATASUM (1 << 0) #define BTRFS_INODE_NODATACOW (1 << 1) #define BTRFS_INODE_READONLY (1 << 2) #define BTRFS_INODE_NOCOMPRESS (1 << 3) #define BTRFS_INODE_PREALLOC (1 << 4) #define BTRFS_INODE_SYNC (1 << 5) #define BTRFS_INODE_IMMUTABLE (1 << 6) #define BTRFS_INODE_APPEND (1 << 7) #define BTRFS_INODE_NODUMP (1 << 8) #define BTRFS_INODE_NOATIME (1 << 9) #define BTRFS_INODE_DIRSYNC (1 << 10) #define BTRFS_INODE_COMPRESS (1 << 11) #define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) #define BTRFS_INODE_FLAG_MASK \ (BTRFS_INODE_NODATASUM | \ BTRFS_INODE_NODATACOW | \ BTRFS_INODE_READONLY | \ BTRFS_INODE_NOCOMPRESS | \ BTRFS_INODE_PREALLOC | \ BTRFS_INODE_SYNC | \ BTRFS_INODE_IMMUTABLE | \ BTRFS_INODE_APPEND | \ BTRFS_INODE_NODUMP | \ BTRFS_INODE_NOATIME | \ BTRFS_INODE_DIRSYNC | \ BTRFS_INODE_COMPRESS | \ BTRFS_INODE_ROOT_ITEM_INIT) struct btrfs_map_token { const struct extent_buffer *eb; char *kaddr; unsigned long offset; }; #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ ((bytes) >> (fs_info)->sb->s_blocksize_bits) static inline void btrfs_init_map_token (struct btrfs_map_token *token) { token->kaddr = NULL; } /* some macros to generate set/get functions for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: */ #define le8_to_cpu(v) (v) #define cpu_to_le8(v) (v) #define __le8 u8 #define read_eb_member(eb, ptr, type, member, result) (\ read_extent_buffer(eb, (char *)(result), \ ((unsigned long)(ptr)) + \ offsetof(type, member), \ sizeof(((type *)0)->member))) #define write_eb_member(eb, ptr, type, member, result) (\ write_extent_buffer(eb, (char *)(result), \ ((unsigned long)(ptr)) + \ offsetof(type, member), \ sizeof(((type *)0)->member))) #define DECLARE_BTRFS_SETGET_BITS(bits) \ u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \ const void *ptr, unsigned long off, \ struct btrfs_map_token *token); \ void btrfs_set_token_##bits(struct extent_buffer *eb, const void *ptr, \ unsigned long off, u##bits val, \ struct btrfs_map_token *token); \ static inline u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ const void *ptr, \ unsigned long off) \ { \ return btrfs_get_token_##bits(eb, ptr, off, NULL); \ } \ static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr,\ unsigned long off, u##bits val) \ { \ btrfs_set_token_##bits(eb, ptr, off, val, NULL); \ } DECLARE_BTRFS_SETGET_BITS(8) DECLARE_BTRFS_SETGET_BITS(16) DECLARE_BTRFS_SETGET_BITS(32) DECLARE_BTRFS_SETGET_BITS(64) #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ const type *s) \ { \ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ return btrfs_get_##bits(eb, s, offsetof(type, member)); \ } \ static inline void btrfs_set_##name(struct extent_buffer *eb, type *s, \ u##bits val) \ { \ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ btrfs_set_##bits(eb, s, offsetof(type, member), val); \ } \ static inline u##bits btrfs_token_##name(const struct extent_buffer *eb,\ const type *s, \ struct btrfs_map_token *token) \ { \ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ return btrfs_get_token_##bits(eb, s, offsetof(type, member), token); \ } \ static inline void btrfs_set_token_##name(struct extent_buffer *eb, \ type *s, u##bits val, \ struct btrfs_map_token *token) \ { \ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ btrfs_set_token_##bits(eb, s, offsetof(type, member), val, token); \ } #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ { \ const type *p = page_address(eb->pages[0]); \ u##bits res = le##bits##_to_cpu(p->member); \ return res; \ } \ static inline void btrfs_set_##name(struct extent_buffer *eb, \ u##bits val) \ { \ type *p = page_address(eb->pages[0]); \ p->member = cpu_to_le##bits(val); \ } #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const type *s) \ { \ return le##bits##_to_cpu(s->member); \ } \ static inline void btrfs_set_##name(type *s, u##bits val) \ { \ s->member = cpu_to_le##bits(val); \ } static inline u64 btrfs_device_total_bytes(struct extent_buffer *eb, struct btrfs_dev_item *s) { BUILD_BUG_ON(sizeof(u64) != sizeof(((struct btrfs_dev_item *)0))->total_bytes); return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes)); } static inline void btrfs_set_device_total_bytes(struct extent_buffer *eb, struct btrfs_dev_item *s, u64 val) { BUILD_BUG_ON(sizeof(u64) != sizeof(((struct btrfs_dev_item *)0))->total_bytes); WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); } BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, start_offset, 64); BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, total_bytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, bytes_used, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, io_align, 32); BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, io_width, 32); BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, sector_size, 32); BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, dev_group, 32); BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, seek_speed, 8); BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, bandwidth, 8); BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, generation, 64); static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d) { return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid); } static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d) { return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid); } BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) { return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); } BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, io_align, 32); BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, io_width, 32); BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, sector_size, 32); BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, int nr) { unsigned long offset = (unsigned long)c; offset += offsetof(struct btrfs_chunk, stripe); offset += nr * sizeof(struct btrfs_stripe); return (struct btrfs_stripe *)offset; } static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) { return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); } static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb, struct btrfs_chunk *c, int nr) { return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); } static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb, struct btrfs_chunk *c, int nr) { return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); } /* struct btrfs_block_group_item */ BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item, used, 64); BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item, used, 64); BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid, struct btrfs_block_group_item, chunk_objectid, 64); BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid, struct btrfs_block_group_item, chunk_objectid, 64); BTRFS_SETGET_FUNCS(disk_block_group_flags, struct btrfs_block_group_item, flags, 64); BTRFS_SETGET_STACK_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64); /* struct btrfs_free_space_info */ BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, extent_count, 32); BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); /* struct btrfs_inode_ref */ BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); /* struct btrfs_inode_extref */ BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, parent_objectid, 64); BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref, name_len, 16); BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); /* struct btrfs_inode_item */ BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, sequence, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, transid, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, nbytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, block_group, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); /* struct btrfs_dev_extent */ BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64); BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, chunk_objectid, 64); BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, chunk_offset, 64); BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); static inline unsigned long btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) { unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid); return (unsigned long)dev + ptr; } BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, generation, 64); BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32); BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); static inline void btrfs_tree_block_key(struct extent_buffer *eb, struct btrfs_tree_block_info *item, struct btrfs_disk_key *key) { read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); } static inline void btrfs_set_tree_block_key(struct extent_buffer *eb, struct btrfs_tree_block_info *item, struct btrfs_disk_key *key) { write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); } BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, root, 64); BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, objectid, 64); BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, offset, 64); BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 32); BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32); BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, type, 8); BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, offset, 64); static inline u32 btrfs_extent_inline_ref_size(int type) { if (type == BTRFS_TREE_BLOCK_REF_KEY || type == BTRFS_SHARED_BLOCK_REF_KEY) return sizeof(struct btrfs_extent_inline_ref); if (type == BTRFS_SHARED_DATA_REF_KEY) return sizeof(struct btrfs_shared_data_ref) + sizeof(struct btrfs_extent_inline_ref); if (type == BTRFS_EXTENT_DATA_REF_KEY) return sizeof(struct btrfs_extent_data_ref) + offsetof(struct btrfs_extent_inline_ref, offset); return 0; } BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64); BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0, generation, 64); BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64); BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32); /* struct btrfs_node */ BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, blockptr, 64); BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr, generation, 64); static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr) { unsigned long ptr; ptr = offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); } static inline void btrfs_set_node_blockptr(struct extent_buffer *eb, int nr, u64 val) { unsigned long ptr; ptr = offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); } static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr) { unsigned long ptr; ptr = offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); } static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb, int nr, u64 val) { unsigned long ptr; ptr = offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); } static inline unsigned long btrfs_node_key_ptr_offset(int nr) { return offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; } void btrfs_node_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr); static inline void btrfs_set_node_key(struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { unsigned long ptr; ptr = btrfs_node_key_ptr_offset(nr); write_eb_member(eb, (struct btrfs_key_ptr *)ptr, struct btrfs_key_ptr, key, disk_key); } /* struct btrfs_item */ BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); static inline unsigned long btrfs_item_nr_offset(int nr) { return offsetof(struct btrfs_leaf, items) + sizeof(struct btrfs_item) * nr; } static inline struct btrfs_item *btrfs_item_nr(int nr) { return (struct btrfs_item *)btrfs_item_nr_offset(nr); } static inline u32 btrfs_item_end(const struct extent_buffer *eb, struct btrfs_item *item) { return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item); } static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr) { return btrfs_item_end(eb, btrfs_item_nr(nr)); } static inline u32 btrfs_item_offset_nr(const struct extent_buffer *eb, int nr) { return btrfs_item_offset(eb, btrfs_item_nr(nr)); } static inline u32 btrfs_item_size_nr(const struct extent_buffer *eb, int nr) { return btrfs_item_size(eb, btrfs_item_nr(nr)); } static inline void btrfs_item_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { struct btrfs_item *item = btrfs_item_nr(nr); read_eb_member(eb, item, struct btrfs_item, key, disk_key); } static inline void btrfs_set_item_key(struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { struct btrfs_item *item = btrfs_item_nr(nr); write_eb_member(eb, item, struct btrfs_item, key, disk_key); } BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); /* * struct btrfs_root_ref */ BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); /* struct btrfs_dir_item */ BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8); BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, data_len, 16); BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, name_len, 16); BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, transid, 64); static inline void btrfs_dir_item_key(const struct extent_buffer *eb, const struct btrfs_dir_item *item, struct btrfs_disk_key *key) { read_eb_member(eb, item, struct btrfs_dir_item, location, key); } static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, struct btrfs_dir_item *item, const struct btrfs_disk_key *key) { write_eb_member(eb, item, struct btrfs_dir_item, location, key); } BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, num_entries, 64); BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, num_bitmaps, 64); BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, generation, 64); static inline void btrfs_free_space_key(const struct extent_buffer *eb, const struct btrfs_free_space_header *h, struct btrfs_disk_key *key) { read_eb_member(eb, h, struct btrfs_free_space_header, location, key); } static inline void btrfs_set_free_space_key(struct extent_buffer *eb, struct btrfs_free_space_header *h, const struct btrfs_disk_key *key) { write_eb_member(eb, h, struct btrfs_free_space_header, location, key); } /* struct btrfs_disk_key */ BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, objectid, 64); BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, const struct btrfs_disk_key *disk) { cpu->offset = le64_to_cpu(disk->offset); cpu->type = disk->type; cpu->objectid = le64_to_cpu(disk->objectid); } static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, const struct btrfs_key *cpu) { disk->offset = cpu_to_le64(cpu->offset); disk->type = cpu->type; disk->objectid = cpu_to_le64(cpu->objectid); } static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, struct btrfs_key *key, int nr) { struct btrfs_disk_key disk_key; btrfs_node_key(eb, &disk_key, nr); btrfs_disk_key_to_cpu(key, &disk_key); } static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, struct btrfs_key *key, int nr) { struct btrfs_disk_key disk_key; btrfs_item_key(eb, &disk_key, nr); btrfs_disk_key_to_cpu(key, &disk_key); } static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, const struct btrfs_dir_item *item, struct btrfs_key *key) { struct btrfs_disk_key disk_key; btrfs_dir_item_key(eb, item, &disk_key); btrfs_disk_key_to_cpu(key, &disk_key); } static inline u8 btrfs_key_type(const struct btrfs_key *key) { return key->type; } static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val) { key->type = val; } /* struct btrfs_header */ BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, generation, 64); BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64); BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, nritems, 32); BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag) { return (btrfs_header_flags(eb) & flag) == flag; } static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) { u64 flags = btrfs_header_flags(eb); btrfs_set_header_flags(eb, flags | flag); return (flags & flag) == flag; } static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) { u64 flags = btrfs_header_flags(eb); btrfs_set_header_flags(eb, flags & ~flag); return (flags & flag) == flag; } static inline int btrfs_header_backref_rev(const struct extent_buffer *eb) { u64 flags = btrfs_header_flags(eb); return flags >> BTRFS_BACKREF_REV_SHIFT; } static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, int rev) { u64 flags = btrfs_header_flags(eb); flags &= ~BTRFS_BACKREF_REV_MASK; flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; btrfs_set_header_flags(eb, flags); } static inline unsigned long btrfs_header_fsid(void) { return offsetof(struct btrfs_header, fsid); } static inline unsigned long btrfs_header_chunk_tree_uuid(const struct extent_buffer *eb) { return offsetof(struct btrfs_header, chunk_tree_uuid); } static inline int btrfs_is_leaf(const struct extent_buffer *eb) { return btrfs_header_level(eb) == 0; } /* struct btrfs_root_item */ BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, generation, 64); BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, last_snapshot, 64); BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item, generation_v2, 64); BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, ctransid, 64); BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, otransid, 64); BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, stransid, 64); BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, rtransid, 64); static inline bool btrfs_root_readonly(const struct btrfs_root *root) { return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; } static inline bool btrfs_root_dead(const struct btrfs_root *root) { return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; } /* struct btrfs_root_backup */ BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, tree_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, tree_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, tree_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, chunk_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, chunk_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, chunk_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, extent_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, extent_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, extent_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, fs_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, fs_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, fs_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, dev_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, dev_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, dev_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, csum_root, 64); BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, csum_root_gen, 64); BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, csum_root_level, 8); BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, total_bytes, 64); BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, bytes_used, 64); BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, num_devices, 64); /* struct btrfs_balance_item */ BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); static inline void btrfs_balance_data(const struct extent_buffer *eb, const struct btrfs_balance_item *bi, struct btrfs_disk_balance_args *ba) { read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); } static inline void btrfs_set_balance_data(struct extent_buffer *eb, struct btrfs_balance_item *bi, const struct btrfs_disk_balance_args *ba) { write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); } static inline void btrfs_balance_meta(const struct extent_buffer *eb, const struct btrfs_balance_item *bi, struct btrfs_disk_balance_args *ba) { read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); } static inline void btrfs_set_balance_meta(struct extent_buffer *eb, struct btrfs_balance_item *bi, const struct btrfs_disk_balance_args *ba) { write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); } static inline void btrfs_balance_sys(const struct extent_buffer *eb, const struct btrfs_balance_item *bi, struct btrfs_disk_balance_args *ba) { read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); } static inline void btrfs_set_balance_sys(struct extent_buffer *eb, struct btrfs_balance_item *bi, const struct btrfs_disk_balance_args *ba) { write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); } static inline void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, const struct btrfs_disk_balance_args *disk) { memset(cpu, 0, sizeof(*cpu)); cpu->profiles = le64_to_cpu(disk->profiles); cpu->usage = le64_to_cpu(disk->usage); cpu->devid = le64_to_cpu(disk->devid); cpu->pstart = le64_to_cpu(disk->pstart); cpu->pend = le64_to_cpu(disk->pend); cpu->vstart = le64_to_cpu(disk->vstart); cpu->vend = le64_to_cpu(disk->vend); cpu->target = le64_to_cpu(disk->target); cpu->flags = le64_to_cpu(disk->flags); cpu->limit = le64_to_cpu(disk->limit); cpu->stripes_min = le32_to_cpu(disk->stripes_min); cpu->stripes_max = le32_to_cpu(disk->stripes_max); } static inline void btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, const struct btrfs_balance_args *cpu) { memset(disk, 0, sizeof(*disk)); disk->profiles = cpu_to_le64(cpu->profiles); disk->usage = cpu_to_le64(cpu->usage); disk->devid = cpu_to_le64(cpu->devid); disk->pstart = cpu_to_le64(cpu->pstart); disk->pend = cpu_to_le64(cpu->pend); disk->vstart = cpu_to_le64(cpu->vstart); disk->vend = cpu_to_le64(cpu->vend); disk->target = cpu_to_le64(cpu->target); disk->flags = cpu_to_le64(cpu->flags); disk->limit = cpu_to_le64(cpu->limit); disk->stripes_min = cpu_to_le32(cpu->stripes_min); disk->stripes_max = cpu_to_le32(cpu->stripes_max); } /* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, generation, 64); BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, struct btrfs_super_block, sys_chunk_array_size, 32); BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, struct btrfs_super_block, chunk_root_generation, 64); BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, chunk_root, 64); BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, chunk_root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, log_root, 64); BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block, log_root_transid, 64); BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, log_root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, total_bytes, 64); BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, bytes_used, 64); BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, sectorsize, 32); BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, nodesize, 32); BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, stripesize, 32); BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, root_dir_objectid, 64); BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, num_devices, 64); BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, compat_flags, 64); BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, compat_ro_flags, 64); BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, incompat_flags, 64); BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, csum_type, 16); BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, cache_generation, 64); BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); static inline int btrfs_super_csum_size(const struct btrfs_super_block *s) { u16 t = btrfs_super_csum_type(s); /* * csum type is validated at mount time */ return btrfs_csum_sizes[t]; } /* * The leaf data grows from end-to-front in the node. * this returns the address of the start of the last item, * which is the stop of the leaf data stack */ static inline unsigned int leaf_data_end(const struct btrfs_fs_info *fs_info, const struct extent_buffer *leaf) { u32 nr = btrfs_header_nritems(leaf); if (nr == 0) return BTRFS_LEAF_DATA_SIZE(fs_info); return btrfs_item_offset_nr(leaf, nr - 1); } /* struct btrfs_file_extent_item */ BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, struct btrfs_file_extent_item, disk_bytenr, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, struct btrfs_file_extent_item, offset, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, struct btrfs_file_extent_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, struct btrfs_file_extent_item, num_bytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, struct btrfs_file_extent_item, disk_num_bytes, 64); BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, struct btrfs_file_extent_item, compression, 8); static inline unsigned long btrfs_file_extent_inline_start(const struct btrfs_file_extent_item *e) { return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; } static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) { return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; } BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, disk_bytenr, 64); BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, generation, 64); BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, disk_num_bytes, 64); BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, offset, 64); BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, num_bytes, 64); BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, ram_bytes, 64); BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, compression, 8); BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, encryption, 8); BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, other_encoding, 16); /* * this returns the number of bytes used by the item on disk, minus the * size of any extent headers. If a file is compressed on disk, this is * the compressed size */ static inline u32 btrfs_file_extent_inline_item_len( const struct extent_buffer *eb, struct btrfs_item *e) { return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START; } /* btrfs_dev_stats_item */ static inline u64 btrfs_dev_stats_value(const struct extent_buffer *eb, const struct btrfs_dev_stats_item *ptr, int index) { u64 val; read_extent_buffer(eb, &val, offsetof(struct btrfs_dev_stats_item, values) + ((unsigned long)ptr) + (index * sizeof(u64)), sizeof(val)); return val; } static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb, struct btrfs_dev_stats_item *ptr, int index, u64 val) { write_extent_buffer(eb, &val, offsetof(struct btrfs_dev_stats_item, values) + ((unsigned long)ptr) + (index * sizeof(u64)), sizeof(val)); } /* btrfs_qgroup_status_item */ BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, generation, 64); BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, version, 64); BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, flags, 64); BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, rescan, 64); /* btrfs_qgroup_info_item */ BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, generation, 64); BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, rfer_cmpr, 64); BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, excl_cmpr, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation, struct btrfs_qgroup_info_item, generation, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, rfer_cmpr, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, excl_cmpr, 64); /* btrfs_qgroup_limit_item */ BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, flags, 64); BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, max_rfer, 64); BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, max_excl, 64); BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, rsv_rfer, 64); BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, rsv_excl, 64); /* btrfs_dev_replace_item */ BTRFS_SETGET_FUNCS(dev_replace_src_devid, struct btrfs_dev_replace_item, src_devid, 64); BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, 64); BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, replace_state, 64); BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, time_started, 64); BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, time_stopped, 64); BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, num_write_errors, 64); BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, struct btrfs_dev_replace_item, num_uncorrectable_read_errors, 64); BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, cursor_left, 64); BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, cursor_right, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, struct btrfs_dev_replace_item, src_devid, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, struct btrfs_dev_replace_item, replace_state, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, struct btrfs_dev_replace_item, time_started, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, struct btrfs_dev_replace_item, time_stopped, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, struct btrfs_dev_replace_item, num_write_errors, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, struct btrfs_dev_replace_item, num_uncorrectable_read_errors, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, struct btrfs_dev_replace_item, cursor_left, 64); BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, struct btrfs_dev_replace_item, cursor_right, 64); /* helper function to cast into the data area of the leaf. */ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(BTRFS_LEAF_DATA_OFFSET + \ btrfs_item_offset_nr(leaf, slot))) #define btrfs_item_ptr_offset(leaf, slot) \ ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ btrfs_item_offset_nr(leaf, slot))) static inline u64 btrfs_name_hash(const char *name, int len) { return crc32c((u32)~1, name, len); } /* * Figure the key offset of an extended inode ref */ static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, int len) { return (u64) crc32c(parent_objectid, name, len); } static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) { return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); } static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) { return mapping_gfp_constraint(mapping, ~__GFP_FS); } /* extent-tree.c */ enum btrfs_inline_ref_type { BTRFS_REF_TYPE_INVALID = 0, BTRFS_REF_TYPE_BLOCK = 1, BTRFS_REF_TYPE_DATA = 2, BTRFS_REF_TYPE_ANY = 3, }; int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, struct btrfs_extent_inline_ref *iref, enum btrfs_inline_ref_type is_data); u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes); static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info, unsigned num_items) { return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; } /* * Doing a truncate won't result in new nodes or leaves, just what we need for * COW. */ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, unsigned num_items) { return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; } int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, unsigned long count, u64 transid, int wait); int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 offset, int metadata, u64 *refs, u64 *flags); int btrfs_pin_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num, int reserved); int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes); int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info, struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr); struct btrfs_block_group_cache *btrfs_lookup_block_group( struct btrfs_fs_info *info, u64 bytenr); void btrfs_get_block_group(struct btrfs_block_group_cache *cache); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, const struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, u64 parent, int last_ref); int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 owner, u64 offset, u64 ram_bytes, struct btrfs_key *ins); int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset, struct btrfs_key *ins); int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 flags, int level, int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc); int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset); int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr); int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_read_block_groups(struct btrfs_fs_info *info); int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr); int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, u64 type, u64 chunk_offset, u64 size); void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info); struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, u64 group_start, struct extent_map *em); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info); u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info); u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); enum btrfs_reserve_flush_enum { /* If we are in the transaction, we can't flush anything.*/ BTRFS_RESERVE_NO_FLUSH, /* * Flushing delalloc may cause deadlock somewhere, in this * case, use FLUSH LIMIT */ BTRFS_RESERVE_FLUSH_LIMIT, BTRFS_RESERVE_FLUSH_ALL, }; enum btrfs_flush_state { FLUSH_DELAYED_ITEMS_NR = 1, FLUSH_DELAYED_ITEMS = 2, FLUSH_DELALLOC = 3, FLUSH_DELALLOC_WAIT = 4, ALLOC_CHUNK = 5, COMMIT_TRANS = 6, }; int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); int btrfs_check_data_free_space(struct inode *inode, struct extent_changeset **reserved, u64 start, u64 len); void btrfs_free_reserved_data_space(struct inode *inode, struct extent_changeset *reserved, u64 start, u64 len); void btrfs_delalloc_release_space(struct inode *inode, struct extent_changeset *reserved, u64 start, u64 len, bool qgroup_free); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free); int btrfs_delalloc_reserve_space(struct inode *inode, struct extent_changeset **reserved, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv, unsigned short type); void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor); int btrfs_block_rsv_refill(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 min_reserved, enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes, int update_size); int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *dest, u64 num_bytes, int min_factor); void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes); int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_start_write_no_snapshotting(struct btrfs_root *root); void btrfs_end_write_no_snapshotting(struct btrfs_root *root); void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type); u64 add_new_free_space(struct btrfs_block_group_cache *block_group, u64 start, u64 end); void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, int level, int *slot); int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); int btrfs_previous_extent_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid); void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, struct btrfs_path *path, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, u64 min_trans); int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, struct btrfs_path *path, u64 min_trans); enum btrfs_compare_tree_result { BTRFS_COMPARE_TREE_NEW, BTRFS_COMPARE_TREE_DELETED, BTRFS_COMPARE_TREE_CHANGED, BTRFS_COMPARE_TREE_SAME, }; typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path, struct btrfs_path *right_path, struct btrfs_key *key, enum btrfs_compare_tree_result result, void *ctx); int btrfs_compare_trees(struct btrfs_root *left_root, struct btrfs_root *right_root, btrfs_changed_cb_t cb, void *ctx); int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret); int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer **cow_ret, u64 new_root_objectid); int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf); void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u32 data_size); void btrfs_truncate_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *new_key, unsigned long split_offset); int btrfs_duplicate_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *new_key); int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, u64 inum, u64 ioff, u8 key_type, struct btrfs_key *found_key); int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow); int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, u64 time_seq); int btrfs_search_slot_for_read(struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int find_higher, int return_any); int btrfs_realloc_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *parent, int start_slot, u64 *last_ret, struct btrfs_key *progress); void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); void btrfs_set_path_blocking(struct btrfs_path *p); void btrfs_clear_path_blocking(struct btrfs_path *p, struct extent_buffer *held, int held_rw); void btrfs_unlock_up_safe(struct btrfs_path *p, int level); int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr); static inline int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path) { return btrfs_del_items(trans, root, path, path->slots[0], 1); } void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, u32 total_data, u32 total_size, int nr); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, int nr); static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *key, u32 data_size) { return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1); } int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); static inline int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *p, u64 time_seq) { ++p->slots[0]; if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) return btrfs_next_old_leaf(root, p, time_seq); return 0; } static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) { return btrfs_next_old_item(root, p, 0); } int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf); int __must_check btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int update_ref, int for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, struct extent_buffer *parent); static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) { /* * Do it this way so we only ever do one test_bit in the normal case. */ if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) return 2; return 1; } return 0; } /* * If we remount the fs to be R/O or umount the fs, the cleaner needn't do * anything except sleeping. This function is used to check the status of * the fs. */ static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) { return fs_info->sb->s_flags & SB_RDONLY || btrfs_fs_closing(fs_info); } static inline void free_fs_info(struct btrfs_fs_info *fs_info) { kfree(fs_info->balance_ctl); kfree(fs_info->delayed_root); kfree(fs_info->extent_root); kfree(fs_info->tree_root); kfree(fs_info->chunk_root); kfree(fs_info->dev_root); kfree(fs_info->csum_root); kfree(fs_info->quota_root); kfree(fs_info->uuid_root); kfree(fs_info->free_space_root); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); security_free_mnt_opts(&fs_info->security_opts); kvfree(fs_info); } /* tree mod log functions from ctree.c */ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem); void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem); int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); /* root-item.c */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, const char *name, int name_len); int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, const char *name, int name_len); int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key); int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_root_item *item); int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_root_item *item); int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, struct btrfs_path *path, struct btrfs_root_item *root_item, struct btrfs_key *root_key); int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info); void btrfs_set_root_node(struct btrfs_root_item *item, struct extent_buffer *node); void btrfs_check_and_init_root_item(struct btrfs_root_item *item); void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root); /* uuid-tree.c */ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, int (*check_func)(struct btrfs_fs_info *, u8 *, u8, u64)); /* dir-item.c */ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, const char *name, int name_len); int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index); struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, const char *name, int name_len, int mod); struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, u64 objectid, const char *name, int name_len, int mod); struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, const char *name, int name_len); int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_dir_item *di); int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, const char *name, u16 name_len, const void *data, u16 data_len); struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, const char *name, u16 name_len, int mod); struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, struct btrfs_path *path, const char *name, int name_len); /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); /* inode-item.c */ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, u64 inode_objectid, u64 ref_objectid, u64 index); int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, u64 inode_objectid, u64 ref_objectid, u64 *index); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *location, int mod); struct btrfs_inode_extref * btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const char *name, int name_len, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, const char *name, int name_len, struct btrfs_inode_ref **ref_ret); int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot, u64 ref_objectid, const char *name, int name_len, struct btrfs_inode_extref **extref_ret); /* file-item.c */ struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, u64 disk_offset, u64 disk_num_bytes, u64 num_bytes, u64 offset, u64 ram_bytes, u8 compression, u8 encryption, u16 other_encoding); int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, u64 bytenr, int mod); int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, const bool new_inline, struct extent_map *em); /* inode.c */ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes); void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode); struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len); int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const char *name, int name_len, int add_backref, u64 index); int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, int front); int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 new_size, u32 min_type); int btrfs_start_delalloc_snapshot(struct btrfs_root *root); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state, int dedupe); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, struct btrfs_root *parent_root, u64 new_dirid); int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); int __init btrfs_init_cachep(void); void __cold btrfs_destroy_cachep(void); struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, struct btrfs_root *root, int *new, struct btrfs_path *path); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, struct btrfs_root *root, int *was_new); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 end, int create); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); void btrfs_add_delayed_iput(struct inode *inode); void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); int btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); int btrfs_run_delalloc_range(void *private_data, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc); extern const struct dentry_operations btrfs_dentry_operations; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_inode_set_ops(struct inode *inode); #endif /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); int btrfs_is_empty_uuid(u8 *uuid); int btrfs_defrag_file(struct inode *inode, struct file *file, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_pages); void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff, struct file *dst_file, loff_t dst_loff, u64 olen); /* file.c */ int __init btrfs_auto_defrag_init(void); void __cold btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end, int skip_pinned); extern const struct file_operations btrfs_file_operations; int __btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, u64 start, u64 end, u64 *drop_end, int drop_cache, int replace_extent, u32 extent_item_size, int *key_inserted); int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, int drop_cache); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); int btrfs_dirty_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); int btrfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root); /* sysfs.c */ int __init btrfs_init_sysfs(void); void __cold btrfs_exit_sysfs(void); int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); /* super.c */ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); int btrfs_sync_fs(struct super_block *sb, int wait); char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, u64 subvol_objectid); static inline __printf(2, 3) __cold void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { } #ifdef CONFIG_PRINTK __printf(2, 3) __cold void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); #else #define btrfs_printk(fs_info, fmt, args...) \ btrfs_no_printk(fs_info, fmt, ##args) #endif #define btrfs_emerg(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_EMERG fmt, ##args) #define btrfs_alert(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_ALERT fmt, ##args) #define btrfs_crit(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_CRIT fmt, ##args) #define btrfs_err(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_ERR fmt, ##args) #define btrfs_warn(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_WARNING fmt, ##args) #define btrfs_notice(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) #define btrfs_info(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_INFO fmt, ##args) /* * Wrappers that use printk_in_rcu */ #define btrfs_emerg_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) #define btrfs_alert_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) #define btrfs_crit_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) #define btrfs_err_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) #define btrfs_warn_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) #define btrfs_notice_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) #define btrfs_info_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) /* * Wrappers that use a ratelimited printk_in_rcu */ #define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) #define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) #define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) #define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) #define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) #define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) #define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \ btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) /* * Wrappers that use a ratelimited printk */ #define btrfs_emerg_rl(fs_info, fmt, args...) \ btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) #define btrfs_alert_rl(fs_info, fmt, args...) \ btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) #define btrfs_crit_rl(fs_info, fmt, args...) \ btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) #define btrfs_err_rl(fs_info, fmt, args...) \ btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) #define btrfs_warn_rl(fs_info, fmt, args...) \ btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) #define btrfs_notice_rl(fs_info, fmt, args...) \ btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) #define btrfs_info_rl(fs_info, fmt, args...) \ btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) #if defined(CONFIG_DYNAMIC_DEBUG) #define btrfs_debug(fs_info, fmt, args...) \ do { \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args); \ } while (0) #define btrfs_debug_in_rcu(fs_info, fmt, args...) \ do { \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \ btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args); \ } while (0) #define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ do { \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \ btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, \ ##args);\ } while (0) #define btrfs_debug_rl(fs_info, fmt, args...) \ do { \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \ btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, \ ##args); \ } while (0) #elif defined(DEBUG) #define btrfs_debug(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_in_rcu(fs_info, fmt, args...) \ btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl(fs_info, fmt, args...) \ btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) #else #define btrfs_debug(fs_info, fmt, args...) \ btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_in_rcu(fs_info, fmt, args...) \ btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl(fs_info, fmt, args...) \ btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) #endif #define btrfs_printk_in_rcu(fs_info, fmt, args...) \ do { \ rcu_read_lock(); \ btrfs_printk(fs_info, fmt, ##args); \ rcu_read_unlock(); \ } while (0) #define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \ do { \ rcu_read_lock(); \ btrfs_no_printk(fs_info, fmt, ##args); \ rcu_read_unlock(); \ } while (0) #define btrfs_printk_ratelimited(fs_info, fmt, args...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ if (__ratelimit(&_rs)) \ btrfs_printk(fs_info, fmt, ##args); \ } while (0) #define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ do { \ rcu_read_lock(); \ btrfs_printk_ratelimited(fs_info, fmt, ##args); \ rcu_read_unlock(); \ } while (0) #ifdef CONFIG_BTRFS_ASSERT __cold static inline void assfail(const char *expr, const char *file, int line) { pr_err("assertion failed: %s, file: %s, line: %d\n", expr, file, line); BUG(); } #define ASSERT(expr) \ (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) #else #define ASSERT(expr) ((void)0) #endif __cold static inline void btrfs_print_v0_err(struct btrfs_fs_info *fs_info) { btrfs_err(fs_info, "Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); } __printf(5, 6) __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); const char *btrfs_decode_error(int errno); __cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *function, unsigned int line, int errno); /* * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact line number is reported. */ #define btrfs_abort_transaction(trans, errno) \ do { \ /* Report first abort since mount */ \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ &((trans)->fs_info->fs_state))) { \ if ((errno) != -EIO) { \ WARN(1, KERN_DEBUG \ "BTRFS: Transaction aborted (error %d)\n", \ (errno)); \ } else { \ btrfs_debug((trans)->fs_info, \ "Transaction aborted (error %d)", \ (errno)); \ } \ } \ __btrfs_abort_transaction((trans), __func__, \ __LINE__, (errno)); \ } while (0) #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ do { \ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ (errno), fmt, ##args); \ } while (0) __printf(5, 6) __cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); /* * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic * will panic(). Otherwise we BUG() here. */ #define btrfs_panic(fs_info, errno, fmt, args...) \ do { \ __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ BUG(); \ } while (0) /* compatibility and incompatibility defines */ #define btrfs_set_fs_incompat(__fs_info, opt) \ __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) { struct btrfs_super_block *disk_super; u64 features; disk_super = fs_info->super_copy; features = btrfs_super_incompat_flags(disk_super); if (!(features & flag)) { spin_lock(&fs_info->super_lock); features = btrfs_super_incompat_flags(disk_super); if (!(features & flag)) { features |= flag; btrfs_set_super_incompat_flags(disk_super, features); btrfs_info(fs_info, "setting %llu feature flag", flag); } spin_unlock(&fs_info->super_lock); } } #define btrfs_clear_fs_incompat(__fs_info, opt) \ __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) { struct btrfs_super_block *disk_super; u64 features; disk_super = fs_info->super_copy; features = btrfs_super_incompat_flags(disk_super); if (features & flag) { spin_lock(&fs_info->super_lock); features = btrfs_super_incompat_flags(disk_super); if (features & flag) { features &= ~flag; btrfs_set_super_incompat_flags(disk_super, features); btrfs_info(fs_info, "clearing %llu feature flag", flag); } spin_unlock(&fs_info->super_lock); } } #define btrfs_fs_incompat(fs_info, opt) \ __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) { struct btrfs_super_block *disk_super; disk_super = fs_info->super_copy; return !!(btrfs_super_incompat_flags(disk_super) & flag); } #define btrfs_set_fs_compat_ro(__fs_info, opt) \ __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) { struct btrfs_super_block *disk_super; u64 features; disk_super = fs_info->super_copy; features = btrfs_super_compat_ro_flags(disk_super); if (!(features & flag)) { spin_lock(&fs_info->super_lock); features = btrfs_super_compat_ro_flags(disk_super); if (!(features & flag)) { features |= flag; btrfs_set_super_compat_ro_flags(disk_super, features); btrfs_info(fs_info, "setting %llu ro feature flag", flag); } spin_unlock(&fs_info->super_lock); } } #define btrfs_clear_fs_compat_ro(__fs_info, opt) \ __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) { struct btrfs_super_block *disk_super; u64 features; disk_super = fs_info->super_copy; features = btrfs_super_compat_ro_flags(disk_super); if (features & flag) { spin_lock(&fs_info->super_lock); features = btrfs_super_compat_ro_flags(disk_super); if (features & flag) { features &= ~flag; btrfs_set_super_compat_ro_flags(disk_super, features); btrfs_info(fs_info, "clearing %llu ro feature flag", flag); } spin_unlock(&fs_info->super_lock); } } #define btrfs_fs_compat_ro(fs_info, opt) \ __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) { struct btrfs_super_block *disk_super; disk_super = fs_info->super_copy; return !!(btrfs_super_compat_ro_flags(disk_super) & flag); } /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type); int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir); #else #define btrfs_get_acl NULL #define btrfs_set_acl NULL static inline int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir) { return 0; } #endif /* relocation.c */ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_recover_relocation(struct btrfs_root *root); int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *cow); void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve); int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); /* scrub.c */ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly, int is_dev_replace); void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); int btrfs_scrub_cancel(struct btrfs_fs_info *info); int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, struct btrfs_device *dev); int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct btrfs_scrub_progress *progress); static inline void btrfs_init_full_stripe_locks_tree( struct btrfs_full_stripe_locks_tree *locks_root) { locks_root->root = RB_ROOT; mutex_init(&locks_root->lock); } /* dev-replace.c */ void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info); void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) { btrfs_bio_counter_sub(fs_info, 1); } /* reada.c */ struct reada_control { struct btrfs_fs_info *fs_info; /* tree to prefetch */ struct btrfs_key key_start; struct btrfs_key key_end; /* exclusive */ atomic_t elems; struct kref refcnt; wait_queue_head_t wait; }; struct reada_control *btrfs_reada_add(struct btrfs_root *root, struct btrfs_key *start, struct btrfs_key *end); int btrfs_reada_wait(void *handle); void btrfs_reada_detach(void *handle); int btree_readahead_hook(struct extent_buffer *eb, int err); static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID && !btrfs_qgroup_level(rootid))) return 1; return 0; } static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) { return signal_pending(current); } /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode); #endif static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state))) return 1; #endif return 0; } static inline void cond_wake_up(struct wait_queue_head *wq) { /* * This implies a full smp_mb barrier, see comments for * waitqueue_active why. */ if (wq_has_sleeper(wq)) wake_up(wq); } static inline void cond_wake_up_nomb(struct wait_queue_head *wq) { /* * Special case for conditional wakeup where the barrier required for * waitqueue_active is implied by some of the preceding code. Eg. one * of such atomic operations (atomic_dec_and_return, ...), or a * unlock/lock sequence, etc. */ if (waitqueue_active(wq)) wake_up(wq); } #endif |
109 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 | /* * Hash algorithms. * * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * */ #ifndef _CRYPTO_INTERNAL_HASH_H #define _CRYPTO_INTERNAL_HASH_H #include <crypto/algapi.h> #include <crypto/hash.h> struct ahash_request; struct scatterlist; struct crypto_hash_walk { char *data; unsigned int offset; unsigned int alignmask; struct page *pg; unsigned int entrylen; unsigned int total; struct scatterlist *sg; unsigned int flags; }; struct ahash_instance { struct ahash_alg alg; }; struct shash_instance { struct shash_alg alg; }; struct crypto_ahash_spawn { struct crypto_spawn base; }; struct crypto_shash_spawn { struct crypto_spawn base; }; extern const struct crypto_type crypto_ahash_type; int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err); int crypto_hash_walk_first(struct ahash_request *req, struct crypto_hash_walk *walk); int crypto_ahash_walk_first(struct ahash_request *req, struct crypto_hash_walk *walk); static inline int crypto_ahash_walk_done(struct crypto_hash_walk *walk, int err) { return crypto_hash_walk_done(walk, err); } static inline int crypto_hash_walk_last(struct crypto_hash_walk *walk) { return !(walk->entrylen | walk->total); } static inline int crypto_ahash_walk_last(struct crypto_hash_walk *walk) { return crypto_hash_walk_last(walk); } int crypto_register_ahash(struct ahash_alg *alg); int crypto_unregister_ahash(struct ahash_alg *alg); int crypto_register_ahashes(struct ahash_alg *algs, int count); void crypto_unregister_ahashes(struct ahash_alg *algs, int count); int ahash_register_instance(struct crypto_template *tmpl, struct ahash_instance *inst); void ahash_free_instance(struct crypto_instance *inst); bool crypto_shash_alg_has_setkey(struct shash_alg *alg); bool crypto_hash_alg_has_setkey(struct hash_alg_common *halg); int crypto_init_ahash_spawn(struct crypto_ahash_spawn *spawn, struct hash_alg_common *alg, struct crypto_instance *inst); static inline void crypto_drop_ahash(struct crypto_ahash_spawn *spawn) { crypto_drop_spawn(&spawn->base); } struct hash_alg_common *ahash_attr_alg(struct rtattr *rta, u32 type, u32 mask); int crypto_register_shash(struct shash_alg *alg); int crypto_unregister_shash(struct shash_alg *alg); int crypto_register_shashes(struct shash_alg *algs, int count); int crypto_unregister_shashes(struct shash_alg *algs, int count); int shash_register_instance(struct crypto_template *tmpl, struct shash_instance *inst); void shash_free_instance(struct crypto_instance *inst); int crypto_init_shash_spawn(struct crypto_shash_spawn *spawn, struct shash_alg *alg, struct crypto_instance *inst); static inline void crypto_drop_shash(struct crypto_shash_spawn *spawn) { crypto_drop_spawn(&spawn->base); } struct shash_alg *shash_attr_alg(struct rtattr *rta, u32 type, u32 mask); int shash_ahash_update(struct ahash_request *req, struct shash_desc *desc); int shash_ahash_finup(struct ahash_request *req, struct shash_desc *desc); int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc); int crypto_init_shash_ops_async(struct crypto_tfm *tfm); static inline void *crypto_ahash_ctx(struct crypto_ahash *tfm) { return crypto_tfm_ctx(crypto_ahash_tfm(tfm)); } static inline struct ahash_alg *__crypto_ahash_alg(struct crypto_alg *alg) { return container_of(__crypto_hash_alg_common(alg), struct ahash_alg, halg); } static inline void crypto_ahash_set_reqsize(struct crypto_ahash *tfm, unsigned int reqsize) { tfm->reqsize = reqsize; } static inline struct crypto_instance *ahash_crypto_instance( struct ahash_instance *inst) { return container_of(&inst->alg.halg.base, struct crypto_instance, alg); } static inline struct ahash_instance *ahash_instance( struct crypto_instance *inst) { return container_of(&inst->alg, struct ahash_instance, alg.halg.base); } static inline void *ahash_instance_ctx(struct ahash_instance *inst) { return crypto_instance_ctx(ahash_crypto_instance(inst)); } static inline unsigned int ahash_instance_headroom(void) { return sizeof(struct ahash_alg) - sizeof(struct crypto_alg); } static inline struct ahash_instance *ahash_alloc_instance( const char *name, struct crypto_alg *alg) { return crypto_alloc_instance2(name, alg, ahash_instance_headroom()); } static inline void ahash_request_complete(struct ahash_request *req, int err) { req->base.complete(&req->base, err); } static inline u32 ahash_request_flags(struct ahash_request *req) { return req->base.flags; } static inline struct crypto_ahash *crypto_spawn_ahash( struct crypto_ahash_spawn *spawn) { return crypto_spawn_tfm2(&spawn->base); } static inline int ahash_enqueue_request(struct crypto_queue *queue, struct ahash_request *request) { return crypto_enqueue_request(queue, &request->base); } static inline struct ahash_request *ahash_dequeue_request( struct crypto_queue *queue) { return ahash_request_cast(crypto_dequeue_request(queue)); } static inline int ahash_tfm_in_queue(struct crypto_queue *queue, struct crypto_ahash *tfm) { return crypto_tfm_in_queue(queue, crypto_ahash_tfm(tfm)); } static inline void *crypto_shash_ctx(struct crypto_shash *tfm) { return crypto_tfm_ctx(&tfm->base); } static inline struct crypto_instance *shash_crypto_instance( struct shash_instance *inst) { return container_of(&inst->alg.base, struct crypto_instance, alg); } static inline struct shash_instance *shash_instance( struct crypto_instance *inst) { return container_of(__crypto_shash_alg(&inst->alg), struct shash_instance, alg); } static inline void *shash_instance_ctx(struct shash_instance *inst) { return crypto_instance_ctx(shash_crypto_instance(inst)); } static inline struct shash_instance *shash_alloc_instance( const char *name, struct crypto_alg *alg) { return crypto_alloc_instance2(name, alg, sizeof(struct shash_alg) - sizeof(*alg)); } static inline struct crypto_shash *crypto_spawn_shash( struct crypto_shash_spawn *spawn) { return crypto_spawn_tfm2(&spawn->base); } static inline void *crypto_shash_ctx_aligned(struct crypto_shash *tfm) { return crypto_tfm_ctx_aligned(&tfm->base); } static inline struct crypto_shash *__crypto_shash_cast(struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_shash, base); } #endif /* _CRYPTO_INTERNAL_HASH_H */ |
7 7 7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 | // SPDX-License-Identifier: GPL-2.0 /* * drivers/power/process.c - Functions for starting/stopping processes on * suspend transitions. * * Originally from swsusp. */ #undef DEBUG #include <linux/interrupt.h> #include <linux/oom.h> #include <linux/suspend.h> #include <linux/module.h> #include <linux/sched/debug.h> #include <linux/sched/task.h> #include <linux/syscalls.h> #include <linux/freezer.h> #include <linux/delay.h> #include <linux/workqueue.h> #include <linux/kmod.h> #include <trace/events/power.h> #include <linux/cpuset.h> /* * Timeout for stopping processes */ unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; static int try_to_freeze_tasks(bool user_only) { struct task_struct *g, *p; unsigned long end_time; unsigned int todo; bool wq_busy = false; ktime_t start, end, elapsed; unsigned int elapsed_msecs; bool wakeup = false; int sleep_usecs = USEC_PER_MSEC; start = ktime_get_boottime(); end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); if (!user_only) freeze_workqueues_begin(); while (true) { todo = 0; read_lock(&tasklist_lock); for_each_process_thread(g, p) { if (p == current || !freeze_task(p)) continue; if (!freezer_should_skip(p)) todo++; } read_unlock(&tasklist_lock); if (!user_only) { wq_busy = freeze_workqueues_busy(); todo += wq_busy; } if (!todo || time_after(jiffies, end_time)) break; if (pm_wakeup_pending()) { wakeup = true; break; } /* * We need to retry, but first give the freezing tasks some * time to enter the refrigerator. Start with an initial * 1 ms sleep followed by exponential backoff until 8 ms. */ usleep_range(sleep_usecs / 2, sleep_usecs); if (sleep_usecs < 8 * USEC_PER_MSEC) sleep_usecs *= 2; } end = ktime_get_boottime(); elapsed = ktime_sub(end, start); elapsed_msecs = ktime_to_ms(elapsed); if (todo) { pr_cont("\n"); pr_err("Freezing of tasks %s after %d.%03d seconds " "(%d tasks refusing to freeze, wq_busy=%d):\n", wakeup ? "aborted" : "failed", elapsed_msecs / 1000, elapsed_msecs % 1000, todo - wq_busy, wq_busy); if (wq_busy) show_workqueue_state(); if (!wakeup) { read_lock(&tasklist_lock); for_each_process_thread(g, p) { if (p != current && !freezer_should_skip(p) && freezing(p) && !frozen(p)) sched_show_task(p); } read_unlock(&tasklist_lock); } } else { pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, elapsed_msecs % 1000); } return todo ? -EBUSY : 0; } /** * freeze_processes - Signal user space processes to enter the refrigerator. * The current thread will not be frozen. The same process that calls * freeze_processes must later call thaw_processes. * * On success, returns 0. On failure, -errno and system is fully thawed. */ int freeze_processes(void) { int error; error = __usermodehelper_disable(UMH_FREEZING); if (error) return error; /* Make sure this task doesn't get frozen */ current->flags |= PF_SUSPEND_TASK; if (!pm_freezing) atomic_inc(&system_freezing_cnt); pm_wakeup_clear(true); pr_info("Freezing user space processes ... "); pm_freezing = true; error = try_to_freeze_tasks(true); if (!error) { __usermodehelper_set_disable_depth(UMH_DISABLED); pr_cont("done."); } pr_cont("\n"); BUG_ON(in_atomic()); /* * Now that the whole userspace is frozen we need to disbale * the OOM killer to disallow any further interference with * killable tasks. There is no guarantee oom victims will * ever reach a point they go away we have to wait with a timeout. */ if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs))) error = -EBUSY; if (error) thaw_processes(); return error; } /** * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. * * On success, returns 0. On failure, -errno and only the kernel threads are * thawed, so as to give a chance to the caller to do additional cleanups * (if any) before thawing the userspace tasks. So, it is the responsibility * of the caller to thaw the userspace tasks, when the time is right. */ int freeze_kernel_threads(void) { int error; pr_info("Freezing remaining freezable tasks ... "); pm_nosig_freezing = true; error = try_to_freeze_tasks(false); if (!error) pr_cont("done."); pr_cont("\n"); BUG_ON(in_atomic()); if (error) thaw_kernel_threads(); return error; } void thaw_processes(void) { struct task_struct *g, *p; struct task_struct *curr = current; trace_suspend_resume(TPS("thaw_processes"), 0, true); if (pm_freezing) atomic_dec(&system_freezing_cnt); pm_freezing = false; pm_nosig_freezing = false; oom_killer_enable(); pr_info("Restarting tasks ... "); __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); cpuset_wait_for_hotplug(); read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); __thaw_task(p); } read_unlock(&tasklist_lock); WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); curr->flags &= ~PF_SUSPEND_TASK; usermodehelper_enable(); schedule(); pr_cont("done.\n"); trace_suspend_resume(TPS("thaw_processes"), 0, false); } void thaw_kernel_threads(void) { struct task_struct *g, *p; pm_nosig_freezing = false; pr_info("Restarting kernel threads ... "); thaw_workqueues(); read_lock(&tasklist_lock); for_each_process_thread(g, p) { if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) __thaw_task(p); } read_unlock(&tasklist_lock); schedule(); pr_cont("done.\n"); } |
52 1604 1605 174 174 718 42 430 13 27 14 4 49 1029 45 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 | /* * include/linux/hrtimer.h * * hrtimers - High-resolution kernel timers * * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar * * data type definitions, declarations, prototypes * * Started by: Thomas Gleixner and Ingo Molnar * * For licencing details see kernel-base/COPYING */ #ifndef _LINUX_HRTIMER_H #define _LINUX_HRTIMER_H #include <linux/rbtree.h> #include <linux/ktime.h> #include <linux/init.h> #include <linux/list.h> #include <linux/percpu.h> #include <linux/timer.h> #include <linux/timerqueue.h> struct hrtimer_clock_base; struct hrtimer_cpu_base; /* * Mode arguments of xxx_hrtimer functions: * * HRTIMER_MODE_ABS - Time value is absolute * HRTIMER_MODE_REL - Time value is relative to now * HRTIMER_MODE_PINNED - Timer is bound to CPU (is only considered * when starting the timer) * HRTIMER_MODE_SOFT - Timer callback function will be executed in * soft irq context */ enum hrtimer_mode { HRTIMER_MODE_ABS = 0x00, HRTIMER_MODE_REL = 0x01, HRTIMER_MODE_PINNED = 0x02, HRTIMER_MODE_SOFT = 0x04, HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED, HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED, HRTIMER_MODE_ABS_SOFT = HRTIMER_MODE_ABS | HRTIMER_MODE_SOFT, HRTIMER_MODE_REL_SOFT = HRTIMER_MODE_REL | HRTIMER_MODE_SOFT, HRTIMER_MODE_ABS_PINNED_SOFT = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_SOFT, HRTIMER_MODE_REL_PINNED_SOFT = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_SOFT, }; /* * Return values for the callback function */ enum hrtimer_restart { HRTIMER_NORESTART, /* Timer is not restarted */ HRTIMER_RESTART, /* Timer must be restarted */ }; /* * Values to track state of the timer * * Possible states: * * 0x00 inactive * 0x01 enqueued into rbtree * * The callback state is not part of the timer->state because clearing it would * mean touching the timer after the callback, this makes it impossible to free * the timer from the callback function. * * Therefore we track the callback state in: * * timer->base->cpu_base->running == timer * * On SMP it is possible to have a "callback function running and enqueued" * status. It happens for example when a posix timer expired and the callback * queued a signal. Between dropping the lock which protects the posix timer * and reacquiring the base lock of the hrtimer, another CPU can deliver the * signal and rearm the timer. * * All state transitions are protected by cpu_base->lock. */ #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 /** * struct hrtimer - the basic hrtimer structure * @node: timerqueue node, which also manages node.expires, * the absolute expiry time in the hrtimers internal * representation. The time is related to the clock on * which the timer is based. Is setup by adding * slack to the _softexpires value. For non range timers * identical to _softexpires. * @_softexpires: the absolute earliest expiry time of the hrtimer. * The time which was given as expiry time when the timer * was armed. * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * @state: state information (See bit values above) * @is_rel: Set if the timer was armed relative * @is_soft: Set if hrtimer will be expired in soft interrupt context. * * The hrtimer structure must be initialized by hrtimer_init() */ struct hrtimer { struct timerqueue_node node; ktime_t _softexpires; enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; u8 state; u8 is_rel; u8 is_soft; }; /** * struct hrtimer_sleeper - simple sleeper structure * @timer: embedded timer structure * @task: task to wake up * * task is set to NULL, when the timer expires. */ struct hrtimer_sleeper { struct hrtimer timer; struct task_struct *task; }; #ifdef CONFIG_64BIT # define __hrtimer_clock_base_align ____cacheline_aligned #else # define __hrtimer_clock_base_align #endif /** * struct hrtimer_clock_base - the timer base for a specific clock * @cpu_base: per cpu clock base * @index: clock type index for per_cpu support when moving a * timer to a base on another cpu. * @clockid: clock id for per_cpu support * @seq: seqcount around __run_hrtimer * @running: pointer to the currently running hrtimer * @active: red black tree root node for the active timers * @get_time: function to retrieve the current time of the clock * @offset: offset of this clock to the monotonic base */ struct hrtimer_clock_base { struct hrtimer_cpu_base *cpu_base; unsigned int index; clockid_t clockid; seqcount_t seq; struct hrtimer *running; struct timerqueue_head active; ktime_t (*get_time)(void); ktime_t offset; } __hrtimer_clock_base_align; enum hrtimer_base_type { HRTIMER_BASE_MONOTONIC, HRTIMER_BASE_REALTIME, HRTIMER_BASE_BOOTTIME, HRTIMER_BASE_TAI, HRTIMER_BASE_MONOTONIC_SOFT, HRTIMER_BASE_REALTIME_SOFT, HRTIMER_BASE_BOOTTIME_SOFT, HRTIMER_BASE_TAI_SOFT, HRTIMER_MAX_CLOCK_BASES, }; /** * struct hrtimer_cpu_base - the per cpu clock bases * @lock: lock protecting the base and associated clock bases * and timers * @cpu: cpu number * @active_bases: Bitfield to mark bases with active timers * @clock_was_set_seq: Sequence counter of clock was set events * @hres_active: State of high resolution mode * @in_hrtirq: hrtimer_interrupt() is currently executing * @hang_detected: The last hrtimer interrupt detected a hang * @softirq_activated: displays, if the softirq is raised - update of softirq * related settings is not required then. * @nr_events: Total number of hrtimer interrupt events * @nr_retries: Total number of hrtimer interrupt retries * @nr_hangs: Total number of hrtimer interrupt hangs * @max_hang_time: Maximum time spent in hrtimer_interrupt * @expires_next: absolute time of the next event, is required for remote * hrtimer enqueue; it is the total first expiry time (hard * and soft hrtimer are taken into account) * @next_timer: Pointer to the first expiring timer * @softirq_expires_next: Time to check, if soft queues needs also to be expired * @softirq_next_timer: Pointer to the first expiring softirq based timer * @clock_base: array of clock bases for this cpu * * Note: next_timer is just an optimization for __remove_hrtimer(). * Do not dereference the pointer because it is not reliable on * cross cpu removals. */ struct hrtimer_cpu_base { raw_spinlock_t lock; unsigned int cpu; unsigned int active_bases; unsigned int clock_was_set_seq; unsigned int hres_active : 1, in_hrtirq : 1, hang_detected : 1, softirq_activated : 1; #ifdef CONFIG_HIGH_RES_TIMERS unsigned int nr_events; unsigned short nr_retries; unsigned short nr_hangs; unsigned int max_hang_time; #endif ktime_t expires_next; struct hrtimer *next_timer; ktime_t softirq_expires_next; struct hrtimer *softirq_next_timer; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; } ____cacheline_aligned; static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { timer->node.expires = time; timer->_softexpires = time; } static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta) { timer->_softexpires = time; timer->node.expires = ktime_add_safe(time, delta); } static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, u64 delta) { timer->_softexpires = time; timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta)); } static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64) { timer->node.expires = tv64; timer->_softexpires = tv64; } static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time) { timer->node.expires = ktime_add_safe(timer->node.expires, time); timer->_softexpires = ktime_add_safe(timer->_softexpires, time); } static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns) { timer->node.expires = ktime_add_ns(timer->node.expires, ns); timer->_softexpires = ktime_add_ns(timer->_softexpires, ns); } static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer) { return timer->node.expires; } static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer) { return timer->_softexpires; } static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer) { return timer->node.expires; } static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer) { return timer->_softexpires; } static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) { return ktime_to_ns(timer->node.expires); } static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { return ktime_sub(timer->node.expires, timer->base->get_time()); } static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) { return timer->base->get_time(); } static inline int hrtimer_is_hres_active(struct hrtimer *timer) { return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? timer->base->cpu_base->hres_active : 0; } #ifdef CONFIG_HIGH_RES_TIMERS struct clock_event_device; extern void hrtimer_interrupt(struct clock_event_device *dev); /* * The resolution of the clocks. The resolution value is returned in * the clock_getres() system call to give application programmers an * idea of the (in)accuracy of timers. Timer values are rounded up to * this resolution values. */ # define HIGH_RES_NSEC 1 # define KTIME_HIGH_RES (HIGH_RES_NSEC) # define MONOTONIC_RES_NSEC HIGH_RES_NSEC # define KTIME_MONOTONIC_RES KTIME_HIGH_RES extern void clock_was_set_delayed(void); extern unsigned int hrtimer_resolution; #else # define MONOTONIC_RES_NSEC LOW_RES_NSEC # define KTIME_MONOTONIC_RES KTIME_LOW_RES #define hrtimer_resolution (unsigned int)LOW_RES_NSEC static inline void clock_was_set_delayed(void) { } #endif static inline ktime_t __hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) { ktime_t rem = ktime_sub(timer->node.expires, now); /* * Adjust relative timers for the extra we added in * hrtimer_start_range_ns() to prevent short timeouts. */ if (IS_ENABLED(CONFIG_TIME_LOW_RES) && timer->is_rel) rem -= hrtimer_resolution; return rem; } static inline ktime_t hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) { return __hrtimer_expires_remaining_adjusted(timer, timer->base->get_time()); } extern void clock_was_set(void); #ifdef CONFIG_TIMERFD extern void timerfd_clock_was_set(void); #else static inline void timerfd_clock_was_set(void) { } #endif extern void hrtimers_resume(void); DECLARE_PER_CPU(struct tick_device, tick_cpu_device); /* Exported timer functions: */ /* Initialize timers: */ extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); extern void destroy_hrtimer_on_stack(struct hrtimer *timer); #else static inline void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode) { hrtimer_init(timer, which_clock, mode); } static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } #endif /* Basic timer operations: */ extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 range_ns, const enum hrtimer_mode mode); /** * hrtimer_start - (re)start an hrtimer * @timer: the timer to be added * @tim: expiry time * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); * softirq based mode is considered for debug purpose only! */ static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { hrtimer_start_range_ns(timer, tim, 0, mode); } extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); static inline void hrtimer_start_expires(struct hrtimer *timer, enum hrtimer_mode mode) { u64 delta; ktime_t soft, hard; soft = hrtimer_get_softexpires(timer); hard = hrtimer_get_expires(timer); delta = ktime_to_ns(ktime_sub(hard, soft)); hrtimer_start_range_ns(timer, soft, delta, mode); } static inline void hrtimer_restart(struct hrtimer *timer) { hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } /* Query timers: */ extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust); static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer) { return __hrtimer_get_remaining(timer, false); } extern u64 hrtimer_get_next_event(void); extern u64 hrtimer_next_event_without(const struct hrtimer *exclude); extern bool hrtimer_active(const struct hrtimer *timer); /** * hrtimer_is_queued = check, whether the timer is on one of the queues * @timer: Timer to check * * Returns: True if the timer is queued, false otherwise * * The function can be used lockless, but it gives only a current snapshot. */ static inline bool hrtimer_is_queued(struct hrtimer *timer) { /* The READ_ONCE pairs with the update functions of timer->state */ return !!(READ_ONCE(timer->state) & HRTIMER_STATE_ENQUEUED); } /* * Helper function to check, whether the timer is running the callback * function */ static inline int hrtimer_callback_running(struct hrtimer *timer) { return timer->base->running == timer; } /* Forward a hrtimer so it expires after now: */ extern u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); /** * hrtimer_forward_now - forward the timer expiry so it expires after now * @timer: hrtimer to forward * @interval: the interval to forward * * Forward the timer expiry so it will expire after the current time * of the hrtimer clock base. Returns the number of overruns. * * Can be safely called from the callback function of @timer. If * called from other contexts @timer must neither be enqueued nor * running the callback and the caller needs to take care of * serialization. * * Note: This only updates the timer expiry value and does not requeue * the timer. */ static inline u64 hrtimer_forward_now(struct hrtimer *timer, ktime_t interval) { return hrtimer_forward(timer, timer->base->get_time(), interval); } /* Precise sleep: */ extern int nanosleep_copyout(struct restart_block *, struct timespec64 *); extern long hrtimer_nanosleep(const struct timespec64 *rqtp, const enum hrtimer_mode mode, const clockid_t clockid); extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *tsk); extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode); extern int schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, clockid_t clock_id); extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); /* Soft interrupt function to run the hrtimer queues: */ extern void hrtimer_run_queues(void); /* Bootup initialization: */ extern void __init hrtimers_init(void); /* Show pending timers: */ extern void sysrq_timer_list_show(void); int hrtimers_prepare_cpu(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU int hrtimers_dead_cpu(unsigned int cpu); #else #define hrtimers_dead_cpu NULL #endif #endif |
2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 | /* * Glue Code for x86_64/AVX2/AES-NI assembler optimized version of Camellia * * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * */ #include <asm/crypto/camellia.h> #include <asm/crypto/glue_helper.h> #include <crypto/algapi.h> #include <crypto/internal/simd.h> #include <crypto/xts.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/module.h> #include <linux/types.h> #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 #define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32 /* 32-way AVX2/AES-NI parallel cipher functions */ asmlinkage void camellia_ecb_enc_32way(struct camellia_ctx *ctx, u8 *dst, const u8 *src); asmlinkage void camellia_ecb_dec_32way(struct camellia_ctx *ctx, u8 *dst, const u8 *src); asmlinkage void camellia_cbc_dec_32way(struct camellia_ctx *ctx, u8 *dst, const u8 *src); asmlinkage void camellia_ctr_32way(struct camellia_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); asmlinkage void camellia_xts_enc_32way(struct camellia_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); asmlinkage void camellia_xts_dec_32way(struct camellia_ctx *ctx, u8 *dst, const u8 *src, le128 *iv); static const struct common_glue_ctx camellia_enc = { .num_funcs = 4, .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, .funcs = { { .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_32way) } }, { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) } }, { .num_blocks = 2, .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) } }, { .num_blocks = 1, .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) } } } }; static const struct common_glue_ctx camellia_ctr = { .num_funcs = 4, .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, .funcs = { { .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_32way) } }, { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) } }, { .num_blocks = 2, .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) } }, { .num_blocks = 1, .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) } } } }; static const struct common_glue_ctx camellia_enc_xts = { .num_funcs = 3, .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, .funcs = { { .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_32way) } }, { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) } }, { .num_blocks = 1, .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) } } } }; static const struct common_glue_ctx camellia_dec = { .num_funcs = 4, .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, .funcs = { { .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_32way) } }, { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) } }, { .num_blocks = 2, .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) } }, { .num_blocks = 1, .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) } } } }; static const struct common_glue_ctx camellia_dec_cbc = { .num_funcs = 4, .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, .funcs = { { .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_32way) } }, { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) } }, { .num_blocks = 2, .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) } }, { .num_blocks = 1, .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) } } } }; static const struct common_glue_ctx camellia_dec_xts = { .num_funcs = 3, .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, .funcs = { { .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_32way) } }, { .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) } }, { .num_blocks = 1, .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) } } } }; static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen, &tfm->base.crt_flags); } static int ecb_encrypt(struct skcipher_request *req) { return glue_ecb_req_128bit(&camellia_enc, req); } static int ecb_decrypt(struct skcipher_request *req) { return glue_ecb_req_128bit(&camellia_dec, req); } static int cbc_encrypt(struct skcipher_request *req) { return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(camellia_enc_blk), req); } static int cbc_decrypt(struct skcipher_request *req) { return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req); } static int ctr_crypt(struct skcipher_request *req) { return glue_ctr_req_128bit(&camellia_ctr, req); } static int xts_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); return glue_xts_req_128bit(&camellia_enc_xts, req, XTS_TWEAK_CAST(camellia_enc_blk), &ctx->tweak_ctx, &ctx->crypt_ctx); } static int xts_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm); return glue_xts_req_128bit(&camellia_dec_xts, req, XTS_TWEAK_CAST(camellia_enc_blk), &ctx->tweak_ctx, &ctx->crypt_ctx); } static struct skcipher_alg camellia_algs[] = { { .base.cra_name = "__ecb(camellia)", .base.cra_driver_name = "__ecb-camellia-aesni-avx2", .base.cra_priority = 500, .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct camellia_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CAMELLIA_MIN_KEY_SIZE, .max_keysize = CAMELLIA_MAX_KEY_SIZE, .setkey = camellia_setkey, .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { .base.cra_name = "__cbc(camellia)", .base.cra_driver_name = "__cbc-camellia-aesni-avx2", .base.cra_priority = 500, .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct camellia_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CAMELLIA_MIN_KEY_SIZE, .max_keysize = CAMELLIA_MAX_KEY_SIZE, .ivsize = CAMELLIA_BLOCK_SIZE, .setkey = camellia_setkey, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, }, { .base.cra_name = "__ctr(camellia)", .base.cra_driver_name = "__ctr-camellia-aesni-avx2", .base.cra_priority = 500, .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct camellia_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CAMELLIA_MIN_KEY_SIZE, .max_keysize = CAMELLIA_MAX_KEY_SIZE, .ivsize = CAMELLIA_BLOCK_SIZE, .chunksize = CAMELLIA_BLOCK_SIZE, .setkey = camellia_setkey, .encrypt = ctr_crypt, .decrypt = ctr_crypt, }, { .base.cra_name = "__xts(camellia)", .base.cra_driver_name = "__xts-camellia-aesni-avx2", .base.cra_priority = 500, .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct camellia_xts_ctx), .base.cra_module = THIS_MODULE, .min_keysize = 2 * CAMELLIA_MIN_KEY_SIZE, .max_keysize = 2 * CAMELLIA_MAX_KEY_SIZE, .ivsize = CAMELLIA_BLOCK_SIZE, .setkey = xts_camellia_setkey, .encrypt = xts_encrypt, .decrypt = xts_decrypt, }, }; static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)]; static int __init camellia_aesni_init(void) { const char *feature_name; if (!boot_cpu_has(X86_FEATURE_AVX) || !boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AES) || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX2 or AES-NI instructions are not detected.\n"); return -ENODEV; } if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } return simd_register_skciphers_compat(camellia_algs, ARRAY_SIZE(camellia_algs), camellia_simd_algs); } static void __exit camellia_aesni_fini(void) { simd_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs), camellia_simd_algs); } module_init(camellia_aesni_init); module_exit(camellia_aesni_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX2 optimized"); MODULE_ALIAS_CRYPTO("camellia"); MODULE_ALIAS_CRYPTO("camellia-asm"); |
1 1 46 2 4 13 42 42 42 42 14735 48 53 1 48 15 47 48 46 46 48 46 441 7386 7386 8588 8591 78 78 375 377 1201 1201 7 3 3 3 3 3 31 31 31 15 15 13 15 15 5 3 2 1 6 11 10 4 11 11 11 11 2 2 2 11 11 11 11 2 2 2 2 6 11 11 19 46 46 30 9 8 2 21 18 2 27 19 19 18 7 11 24 15 17 15 5 10 15 13 2 20 11 20 20 30 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 | /* * linux/kernel/time/timekeeping.c * * Kernel timekeeping code and accessor functions * * This code was moved from linux/kernel/timer.c. * Please see that file for copyright and history logs. * */ #include <linux/timekeeper_internal.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/nmi.h> #include <linux/sched.h> #include <linux/sched/loadavg.h> #include <linux/sched/clock.h> #include <linux/syscore_ops.h> #include <linux/clocksource.h> #include <linux/jiffies.h> #include <linux/time.h> #include <linux/timex.h> #include <linux/tick.h> #include <linux/stop_machine.h> #include <linux/pvclock_gtod.h> #include <linux/compiler.h> #include "tick-internal.h" #include "ntp_internal.h" #include "timekeeping_internal.h" #define TK_CLEAR_NTP (1 << 0) #define TK_MIRROR (1 << 1) #define TK_CLOCK_WAS_SET (1 << 2) enum timekeeping_adv_mode { /* Update timekeeper when a tick has passed */ TK_ADV_TICK, /* Update timekeeper on a direct frequency change */ TK_ADV_FREQ }; /* * The most important data for readout fits into a single 64 byte * cache line. */ static struct { seqcount_t seq; struct timekeeper timekeeper; } tk_core ____cacheline_aligned = { .seq = SEQCNT_ZERO(tk_core.seq), }; static DEFINE_RAW_SPINLOCK(timekeeper_lock); static struct timekeeper shadow_timekeeper; /** * struct tk_fast - NMI safe timekeeper * @seq: Sequence counter for protecting updates. The lowest bit * is the index for the tk_read_base array * @base: tk_read_base array. Access is indexed by the lowest bit of * @seq. * * See @update_fast_timekeeper() below. */ struct tk_fast { seqcount_t seq; struct tk_read_base base[2]; }; /* Suspend-time cycles value for halted fast timekeeper. */ static u64 cycles_at_suspend; static u64 dummy_clock_read(struct clocksource *cs) { return cycles_at_suspend; } static struct clocksource dummy_clock = { .read = dummy_clock_read, }; static struct tk_fast tk_fast_mono ____cacheline_aligned = { .base[0] = { .clock = &dummy_clock, }, .base[1] = { .clock = &dummy_clock, }, }; static struct tk_fast tk_fast_raw ____cacheline_aligned = { .base[0] = { .clock = &dummy_clock, }, .base[1] = { .clock = &dummy_clock, }, }; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; static inline void tk_normalize_xtime(struct timekeeper *tk) { while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec++; } while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) { tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift; tk->raw_sec++; } } static inline struct timespec64 tk_xtime(const struct timekeeper *tk) { struct timespec64 ts; ts.tv_sec = tk->xtime_sec; ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); return ts; } static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; } static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec += ts->tv_sec; tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_normalize_xtime(tk); } static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) { struct timespec64 tmp; /* * Verify consistency of: offset_real = -wall_to_monotonic * before modifying anything */ set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec, -tk->wall_to_monotonic.tv_nsec); WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp)); tk->wall_to_monotonic = wtm; set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); tk->offs_real = timespec64_to_ktime(tmp); tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); } static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) { tk->offs_boot = ktime_add(tk->offs_boot, delta); } /* * tk_clock_read - atomic clocksource read() helper * * This helper is necessary to use in the read paths because, while the * seqlock ensures we don't return a bad value while structures are updated, * it doesn't protect from potential crashes. There is the possibility that * the tkr's clocksource may change between the read reference, and the * clock reference passed to the read function. This can cause crashes if * the wrong clocksource is passed to the wrong read function. * This isn't necessary to use when holding the timekeeper_lock or doing * a read of the fast-timekeeper tkrs (which is protected by its own locking * and update logic). */ static inline u64 tk_clock_read(const struct tk_read_base *tkr) { struct clocksource *clock = READ_ONCE(tkr->clock); return clock->read(clock); } #ifdef CONFIG_DEBUG_TIMEKEEPING #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ static void timekeeping_check_update(struct timekeeper *tk, u64 offset) { u64 max_cycles = tk->tkr_mono.clock->max_cycles; const char *name = tk->tkr_mono.clock->name; if (offset > max_cycles) { printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", offset, name, max_cycles); printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); } else { if (offset > (max_cycles >> 1)) { printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the '%s' clock's 50%% safety margin (%lld)\n", offset, name, max_cycles >> 1); printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); } } if (tk->underflow_seen) { if (jiffies - tk->last_warning > WARNING_FREQ) { printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); printk_deferred(" Your kernel is probably still fine.\n"); tk->last_warning = jiffies; } tk->underflow_seen = 0; } if (tk->overflow_seen) { if (jiffies - tk->last_warning > WARNING_FREQ) { printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); printk_deferred(" Your kernel is probably still fine.\n"); tk->last_warning = jiffies; } tk->overflow_seen = 0; } } static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) { struct timekeeper *tk = &tk_core.timekeeper; u64 now, last, mask, max, delta; unsigned int seq; /* * Since we're called holding a seqlock, the data may shift * under us while we're doing the calculation. This can cause * false positives, since we'd note a problem but throw the * results away. So nest another seqlock here to atomically * grab the points we are checking with. */ do { seq = read_seqcount_begin(&tk_core.seq); now = tk_clock_read(tkr); last = tkr->cycle_last; mask = tkr->mask; max = tkr->clock->max_cycles; } while (read_seqcount_retry(&tk_core.seq, seq)); delta = clocksource_delta(now, last, mask); /* * Try to catch underflows by checking if we are seeing small * mask-relative negative values. */ if (unlikely((~delta & mask) < (mask >> 3))) { tk->underflow_seen = 1; delta = 0; } /* Cap delta value to the max_cycles values to avoid mult overflows */ if (unlikely(delta > max)) { tk->overflow_seen = 1; delta = tkr->clock->max_cycles; } return delta; } #else static inline void timekeeping_check_update(struct timekeeper *tk, u64 offset) { } static inline u64 timekeeping_get_delta(const struct tk_read_base *tkr) { u64 cycle_now, delta; /* read clocksource */ cycle_now = tk_clock_read(tkr); /* calculate the delta since the last update_wall_time */ delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); return delta; } #endif /** * tk_setup_internals - Set up internals to use clocksource clock. * * @tk: The target timekeeper to setup. * @clock: Pointer to clocksource. * * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment * pair and interval request. * * Unless you're the timekeeping code, you should not be using this! */ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) { u64 interval; u64 tmp, ntpinterval; struct clocksource *old_clock; ++tk->cs_was_changed_seq; old_clock = tk->tkr_mono.clock; tk->tkr_mono.clock = clock; tk->tkr_mono.mask = clock->mask; tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono); tk->tkr_raw.clock = clock; tk->tkr_raw.mask = clock->mask; tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; tmp <<= clock->shift; ntpinterval = tmp; tmp += clock->mult/2; do_div(tmp, clock->mult); if (tmp == 0) tmp = 1; interval = (u64) tmp; tk->cycle_interval = interval; /* Go back from cycles -> shifted ns */ tk->xtime_interval = interval * clock->mult; tk->xtime_remainder = ntpinterval - tk->xtime_interval; tk->raw_interval = interval * clock->mult; /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { int shift_change = clock->shift - old_clock->shift; if (shift_change < 0) { tk->tkr_mono.xtime_nsec >>= -shift_change; tk->tkr_raw.xtime_nsec >>= -shift_change; } else { tk->tkr_mono.xtime_nsec <<= shift_change; tk->tkr_raw.xtime_nsec <<= shift_change; } } tk->tkr_mono.shift = clock->shift; tk->tkr_raw.shift = clock->shift; tk->ntp_error = 0; tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; tk->ntp_tick = ntpinterval << tk->ntp_error_shift; /* * The timekeeper keeps its own mult values for the currently * active clocksource. These value will be adjusted via NTP * to counteract clock drifting. */ tk->tkr_mono.mult = clock->mult; tk->tkr_raw.mult = clock->mult; tk->ntp_err_mult = 0; tk->skip_second_overflow = 0; } /* Timekeeper helper functions. */ #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET static u32 default_arch_gettimeoffset(void) { return 0; } u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; #else static inline u32 arch_gettimeoffset(void) { return 0; } #endif static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 delta) { u64 nsec; nsec = delta * tkr->mult + tkr->xtime_nsec; nsec >>= tkr->shift; /* If arch requires, add in get_arch_timeoffset() */ return nsec + arch_gettimeoffset(); } static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) { u64 delta; delta = timekeeping_get_delta(tkr); return timekeeping_delta_to_ns(tkr, delta); } static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) { u64 delta; /* calculate the delta since the last update_wall_time */ delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); return timekeeping_delta_to_ns(tkr, delta); } /** * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. * @tkr: Timekeeping readout base from which we take the update * * We want to use this from any context including NMI and tracing / * instrumenting the timekeeping code itself. * * Employ the latch technique; see @raw_write_seqcount_latch. * * So if a NMI hits the update of base[0] then it will use base[1] * which is still consistent. In the worst case this can result is a * slightly wrong timestamp (a few nanoseconds). See * @ktime_get_mono_fast_ns. */ static void update_fast_timekeeper(const struct tk_read_base *tkr, struct tk_fast *tkf) { struct tk_read_base *base = tkf->base; /* Force readers off to base[1] */ raw_write_seqcount_latch(&tkf->seq); /* Update base[0] */ memcpy(base, tkr, sizeof(*base)); /* Force readers back to base[0] */ raw_write_seqcount_latch(&tkf->seq); /* Update base[1] */ memcpy(base + 1, base, sizeof(*base)); } /** * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic * * This timestamp is not guaranteed to be monotonic across an update. * The timestamp is calculated by: * * now = base_mono + clock_delta * slope * * So if the update lowers the slope, readers who are forced to the * not yet updated second array are still using the old steeper slope. * * tmono * ^ * | o n * | o n * | u * | o * |o * |12345678---> reader order * * o = old slope * u = update * n = new slope * * So reader 6 will observe time going backwards versus reader 5. * * While other CPUs are likely to be able observe that, the only way * for a CPU local observation is when an NMI hits in the middle of * the update. Timestamps taken from that NMI context might be ahead * of the following timestamps. Callers need to be aware of that and * deal with it. */ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) { struct tk_read_base *tkr; unsigned int seq; u64 now; do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base); now += timekeeping_delta_to_ns(tkr, clocksource_delta( tk_clock_read(tkr), tkr->cycle_last, tkr->mask)); } while (read_seqcount_retry(&tkf->seq, seq)); return now; } u64 ktime_get_mono_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_mono); } EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); u64 ktime_get_raw_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_raw); } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); /** * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. * * To keep it NMI safe since we're accessing from tracing, we're not using a * separate timekeeper with updates to monotonic clock and boot offset * protected with seqlocks. This has the following minor side effects: * * (1) Its possible that a timestamp be taken after the boot offset is updated * but before the timekeeper is updated. If this happens, the new boot offset * is added to the old timekeeping making the clock appear to update slightly * earlier: * CPU 0 CPU 1 * timekeeping_inject_sleeptime64() * __timekeeping_inject_sleeptime(tk, delta); * timestamp(); * timekeeping_update(tk, TK_CLEAR_NTP...); * * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be * partially updated. Since the tk->offs_boot update is a rare event, this * should be a rare occurrence which postprocessing should be able to handle. */ u64 notrace ktime_get_boot_fast_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); /* * See comment for __ktime_get_fast_ns() vs. timestamp ordering */ static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf) { struct tk_read_base *tkr; unsigned int seq; u64 now; do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base_real); now += timekeeping_delta_to_ns(tkr, clocksource_delta( tk_clock_read(tkr), tkr->cycle_last, tkr->mask)); } while (read_seqcount_retry(&tkf->seq, seq)); return now; } /** * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. */ u64 ktime_get_real_fast_ns(void) { return __ktime_get_real_fast_ns(&tk_fast_mono); } EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. * * It generally is unsafe to access the clocksource after timekeeping has been * suspended, so take a snapshot of the readout base of @tk and use it as the * fast timekeeper's readout base while suspended. It will return the same * number of cycles every time until timekeeping is resumed at which time the * proper readout base for the fast timekeeper will be restored automatically. */ static void halt_fast_timekeeper(const struct timekeeper *tk) { static struct tk_read_base tkr_dummy; const struct tk_read_base *tkr = &tk->tkr_mono; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); cycles_at_suspend = tk_clock_read(tkr); tkr_dummy.clock = &dummy_clock; tkr_dummy.base_real = tkr->base + tk->offs_real; update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); tkr = &tk->tkr_raw; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); tkr_dummy.clock = &dummy_clock; update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); } static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) { raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk); } /** * pvclock_gtod_register_notifier - register a pvclock timedata update listener */ int pvclock_gtod_register_notifier(struct notifier_block *nb) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; int ret; raw_spin_lock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); update_pvclock_gtod(tk, true); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; } EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); /** * pvclock_gtod_unregister_notifier - unregister a pvclock * timedata update listener */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { unsigned long flags; int ret; raw_spin_lock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return ret; } EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); /* * tk_update_leap_state - helper to update the next_leap_ktime */ static inline void tk_update_leap_state(struct timekeeper *tk) { tk->next_leap_ktime = ntp_get_next_leap(); if (tk->next_leap_ktime != KTIME_MAX) /* Convert to monotonic time */ tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); } /* * Update the ktime_t based scalar nsec members of the timekeeper */ static inline void tk_update_ktime_data(struct timekeeper *tk) { u64 seconds; u32 nsec; /* * The xtime based monotonic readout is: * nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now(); * The ktime based monotonic readout is: * nsec = base_mono + now(); * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec */ seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); nsec = (u32) tk->wall_to_monotonic.tv_nsec; tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); /* * The sum of the nanoseconds portions of xtime and * wall_to_monotonic can be greater/equal one second. Take * this into account before updating tk->ktime_sec. */ nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); if (nsec >= NSEC_PER_SEC) seconds++; tk->ktime_sec = seconds; /* Update the monotonic raw base */ tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } /* must hold timekeeper_lock */ static void timekeeping_update(struct timekeeper *tk, unsigned int action) { if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; ntp_clear(); } tk_update_leap_state(tk); tk_update_ktime_data(tk); update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); if (action & TK_CLOCK_WAS_SET) tk->clock_was_set_seq++; /* * The mirroring of the data to the shadow-timekeeper needs * to happen last here to ensure we don't over-write the * timekeeper structure on the next update with stale data */ if (action & TK_MIRROR) memcpy(&shadow_timekeeper, &tk_core.timekeeper, sizeof(tk_core.timekeeper)); } /** * timekeeping_forward_now - update clock to the current time * * Forward the current clock to update its state since the last call to * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. */ static void timekeeping_forward_now(struct timekeeper *tk) { u64 cycle_now, delta; cycle_now = tk_clock_read(&tk->tkr_mono); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult; /* If arch requires, add in get_arch_timeoffset() */ tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult; /* If arch requires, add in get_arch_timeoffset() */ tk->tkr_raw.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_raw.shift; tk_normalize_xtime(tk); } /** * ktime_get_real_ts64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set * * Returns the time of day in a timespec64 (WARN if suspended). */ void ktime_get_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); } EXPORT_SYMBOL(ktime_get_real_ts64); ktime_t ktime_get(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get); u32 ktime_get_resolution_ns(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; u32 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift; } while (read_seqcount_retry(&tk_core.seq, seq)); return nsecs; } EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); static ktime_t *offsets[TK_OFFS_MAX] = { [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, }; ktime_t ktime_get_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base, *offset = offsets[offs]; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_with_offset); ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base, *offset = offsets[offs]; u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); /** * ktime_mono_to_any() - convert mononotic time to any other time * @tmono: time to convert. * @offs: which offset to use */ ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) { ktime_t *offset = offsets[offs]; unsigned long seq; ktime_t tconv; do { seq = read_seqcount_begin(&tk_core.seq); tconv = ktime_add(tmono, *offset); } while (read_seqcount_retry(&tk_core.seq, seq)); return tconv; } EXPORT_SYMBOL_GPL(ktime_mono_to_any); /** * ktime_get_raw - Returns the raw monotonic time in ktime_t format */ ktime_t ktime_get_raw(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_raw.base; nsecs = timekeeping_get_ns(&tk->tkr_raw); } while (read_seqcount_retry(&tk_core.seq, seq)); return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_raw); /** * ktime_get_ts64 - get the monotonic clock in timespec64 format * @ts: pointer to timespec variable * * The function calculates the monotonic clock from the realtime * clock and the wall_to_monotonic offset and stores the result * in normalized timespec64 format in the variable pointed to by @ts. */ void ktime_get_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 tomono; unsigned int seq; u64 nsec; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(&tk->tkr_mono); tomono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; timespec64_add_ns(ts, nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts64); /** * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC * * Returns the seconds portion of CLOCK_MONOTONIC with a single non * serialized read. tk->ktime_sec is of type 'unsigned long' so this * works on both 32 and 64 bit systems. On 32 bit systems the readout * covers ~136 years of uptime which should be enough to prevent * premature wrap arounds. */ time64_t ktime_get_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; WARN_ON(timekeeping_suspended); return tk->ktime_sec; } EXPORT_SYMBOL_GPL(ktime_get_seconds); /** * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME * * Returns the wall clock seconds since 1970. This replaces the * get_seconds() interface which is not y2038 safe on 32bit systems. * * For 64bit systems the fast access to tk->xtime_sec is preserved. On * 32bit systems the access must be protected with the sequence * counter to provide "atomic" access to the 64bit tk->xtime_sec * value. */ time64_t ktime_get_real_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; time64_t seconds; unsigned int seq; if (IS_ENABLED(CONFIG_64BIT)) return tk->xtime_sec; do { seq = read_seqcount_begin(&tk_core.seq); seconds = tk->xtime_sec; } while (read_seqcount_retry(&tk_core.seq, seq)); return seconds; } EXPORT_SYMBOL_GPL(ktime_get_real_seconds); /** * __ktime_get_real_seconds - The same as ktime_get_real_seconds * but without the sequence counter protect. This internal function * is called just when timekeeping lock is already held. */ time64_t __ktime_get_real_seconds(void) { struct timekeeper *tk = &tk_core.timekeeper; return tk->xtime_sec; } /** * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter * @systime_snapshot: pointer to struct receiving the system time snapshot */ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; ktime_t base_raw; ktime_t base_real; u64 nsec_raw; u64 nsec_real; u64 now; WARN_ON_ONCE(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); now = tk_clock_read(&tk->tkr_mono); systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); base_raw = tk->tkr_raw.base; nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); } while (read_seqcount_retry(&tk_core.seq, seq)); systime_snapshot->cycles = now; systime_snapshot->real = ktime_add_ns(base_real, nsec_real); systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); } EXPORT_SYMBOL_GPL(ktime_get_snapshot); /* Scale base by mult/div checking for overflow */ static int scale64_check_overflow(u64 mult, u64 div, u64 *base) { u64 tmp, rem; tmp = div64_u64_rem(*base, div, &rem); if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) || ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem))) return -EOVERFLOW; tmp *= mult; rem = div64_u64(rem * mult, div); *base = tmp + rem; return 0; } /** * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval * @history: Snapshot representing start of history * @partial_history_cycles: Cycle offset into history (fractional part) * @total_history_cycles: Total history length in cycles * @discontinuity: True indicates clock was set on history period * @ts: Cross timestamp that should be adjusted using * partial/total ratio * * Helper function used by get_device_system_crosststamp() to correct the * crosstimestamp corresponding to the start of the current interval to the * system counter value (timestamp point) provided by the driver. The * total_history_* quantities are the total history starting at the provided * reference point and ending at the start of the current interval. The cycle * count between the driver timestamp point and the start of the current * interval is partial_history_cycles. */ static int adjust_historical_crosststamp(struct system_time_snapshot *history, u64 partial_history_cycles, u64 total_history_cycles, bool discontinuity, struct system_device_crosststamp *ts) { struct timekeeper *tk = &tk_core.timekeeper; u64 corr_raw, corr_real; bool interp_forward; int ret; if (total_history_cycles == 0 || partial_history_cycles == 0) return 0; /* Interpolate shortest distance from beginning or end of history */ interp_forward = partial_history_cycles > total_history_cycles / 2; partial_history_cycles = interp_forward ? total_history_cycles - partial_history_cycles : partial_history_cycles; /* * Scale the monotonic raw time delta by: * partial_history_cycles / total_history_cycles */ corr_raw = (u64)ktime_to_ns( ktime_sub(ts->sys_monoraw, history->raw)); ret = scale64_check_overflow(partial_history_cycles, total_history_cycles, &corr_raw); if (ret) return ret; /* * If there is a discontinuity in the history, scale monotonic raw * correction by: * mult(real)/mult(raw) yielding the realtime correction * Otherwise, calculate the realtime correction similar to monotonic * raw calculation */ if (discontinuity) { corr_real = mul_u64_u32_div (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult); } else { corr_real = (u64)ktime_to_ns( ktime_sub(ts->sys_realtime, history->real)); ret = scale64_check_overflow(partial_history_cycles, total_history_cycles, &corr_real); if (ret) return ret; } /* Fixup monotonic raw and real time time values */ if (interp_forward) { ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw); ts->sys_realtime = ktime_add_ns(history->real, corr_real); } else { ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw); ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real); } return 0; } /* * cycle_between - true if test occurs chronologically between before and after */ static bool cycle_between(u64 before, u64 test, u64 after) { if (test > before && test < after) return true; if (test < before && before > after) return true; return false; } /** * get_device_system_crosststamp - Synchronously capture system/device timestamp * @get_time_fn: Callback to get simultaneous device time and * system counter from the device driver * @ctx: Context passed to get_time_fn() * @history_begin: Historical reference point used to interpolate system * time when counter provided by the driver is before the current interval * @xtstamp: Receives simultaneously captured system and device time * * Reads a timestamp from a device and correlates it to system time */ int get_device_system_crosststamp(int (*get_time_fn) (ktime_t *device_time, struct system_counterval_t *sys_counterval, void *ctx), void *ctx, struct system_time_snapshot *history_begin, struct system_device_crosststamp *xtstamp) { struct system_counterval_t system_counterval; struct timekeeper *tk = &tk_core.timekeeper; u64 cycles, now, interval_start; unsigned int clock_was_set_seq = 0; ktime_t base_real, base_raw; u64 nsec_real, nsec_raw; u8 cs_was_changed_seq; unsigned long seq; bool do_interp; int ret; do { seq = read_seqcount_begin(&tk_core.seq); /* * Try to synchronously capture device time and a system * counter value calling back into the device driver */ ret = get_time_fn(&xtstamp->device, &system_counterval, ctx); if (ret) return ret; /* * Verify that the clocksource associated with the captured * system counter value is the same as the currently installed * timekeeper clocksource */ if (tk->tkr_mono.clock != system_counterval.cs) return -ENODEV; cycles = system_counterval.cycles; /* * Check whether the system counter value provided by the * device driver is on the current timekeeping interval. */ now = tk_clock_read(&tk->tkr_mono); interval_start = tk->tkr_mono.cycle_last; if (!cycle_between(interval_start, cycles, now)) { clock_was_set_seq = tk->clock_was_set_seq; cs_was_changed_seq = tk->cs_was_changed_seq; cycles = interval_start; do_interp = true; } else { do_interp = false; } base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); base_raw = tk->tkr_raw.base; nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, system_counterval.cycles); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, system_counterval.cycles); } while (read_seqcount_retry(&tk_core.seq, seq)); xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); /* * Interpolate if necessary, adjusting back from the start of the * current interval */ if (do_interp) { u64 partial_history_cycles, total_history_cycles; bool discontinuity; /* * Check that the counter value occurs after the provided * history reference and that the history doesn't cross a * clocksource change */ if (!history_begin || !cycle_between(history_begin->cycles, system_counterval.cycles, cycles) || history_begin->cs_was_changed_seq != cs_was_changed_seq) return -EINVAL; partial_history_cycles = cycles - system_counterval.cycles; total_history_cycles = cycles - history_begin->cycles; discontinuity = history_begin->clock_was_set_seq != clock_was_set_seq; ret = adjust_historical_crosststamp(history_begin, partial_history_cycles, total_history_cycles, discontinuity, xtstamp); if (ret) return ret; } return 0; } EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** * do_settimeofday64 - Sets the time of day. * @ts: pointer to the timespec64 variable containing the new time * * Sets the time of day to the new time and update NTP and notify hrtimers */ int do_settimeofday64(const struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 ts_delta, xt; unsigned long flags; int ret = 0; if (!timespec64_valid_settod(ts)) return -EINVAL; raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); xt = tk_xtime(tk); ts_delta = timespec64_sub(*ts, xt); if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) { ret = -EINVAL; goto out; } tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); tk_set_xtime(tk, ts); out: timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); return ret; } EXPORT_SYMBOL(do_settimeofday64); /** * timekeeping_inject_offset - Adds or subtracts from the current time. * @tv: pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. */ static int timekeeping_inject_offset(const struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; struct timespec64 tmp; int ret = 0; if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tk), *ts); if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || !timespec64_valid_settod(&tmp)) { ret = -EINVAL; goto error; } tk_xtime_add(tk, ts); tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts)); error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); return ret; } /* * Indicates if there is an offset between the system clock and the hardware * clock/persistent clock/rtc. */ int persistent_clock_is_local; /* * Adjust the time obtained from the CMOS to be UTC time instead of * local time. * * This is ugly, but preferable to the alternatives. Otherwise we * would either need to write a program to do it in /etc/rc (and risk * confusion if the program gets run more than once; it would also be * hard to make the program warp the clock precisely n hours) or * compile in the timezone information into the kernel. Bad, bad.... * * - TYT, 1992-01-01 * * The best thing to do is to keep the CMOS clock in universal time (UTC) * as real UNIX machines always do it. This avoids all headaches about * daylight saving times and warping kernel clocks. */ void timekeeping_warp_clock(void) { if (sys_tz.tz_minuteswest != 0) { struct timespec64 adjust; persistent_clock_is_local = 1; adjust.tv_sec = sys_tz.tz_minuteswest * 60; adjust.tv_nsec = 0; timekeeping_inject_offset(&adjust); } } /** * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic * */ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) { tk->tai_offset = tai_offset; tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0)); } /** * change_clocksource - Swaps clocksources if a new one is available * * Accumulates current time interval and initializes new clocksource */ static int change_clocksource(void *data) { struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *new, *old; unsigned long flags; new = (struct clocksource *) data; raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); /* * If the cs is in module, get a module reference. Succeeds * for built-in code (owner == NULL) as well. */ if (try_module_get(new->owner)) { if (!new->enable || new->enable(new) == 0) { old = tk->tkr_mono.clock; tk_setup_internals(tk, new); if (old->disable) old->disable(old); module_put(old->owner); } else { module_put(new->owner); } } timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); return 0; } /** * timekeeping_notify - Install a new clock source * @clock: pointer to the clock source * * This function is called from clocksource.c after a new, better clock * source has been registered. The caller holds the clocksource_mutex. */ int timekeeping_notify(struct clocksource *clock) { struct timekeeper *tk = &tk_core.timekeeper; if (tk->tkr_mono.clock == clock) return 0; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); return tk->tkr_mono.clock == clock ? 0 : -1; } /** * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec * @ts: pointer to the timespec64 to be set * * Returns the raw monotonic time (completely un-modified by ntp) */ void ktime_get_raw_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->raw_sec; nsecs = timekeeping_get_ns(&tk->tkr_raw); } while (read_seqcount_retry(&tk_core.seq, seq)); ts->tv_nsec = 0; timespec64_add_ns(ts, nsecs); } EXPORT_SYMBOL(ktime_get_raw_ts64); /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ int timekeeping_valid_for_hres(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; int ret; do { seq = read_seqcount_begin(&tk_core.seq); ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } /** * timekeeping_max_deferment - Returns max time the clocksource can be deferred */ u64 timekeeping_max_deferment(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; u64 ret; do { seq = read_seqcount_begin(&tk_core.seq); ret = tk->tkr_mono.clock->max_idle_ns; } while (read_seqcount_retry(&tk_core.seq, seq)); return ret; } /** * read_persistent_clock - Return time from the persistent clock. * * Weak dummy function for arches that do not yet support it. * Reads the time from the battery backed persistent clock. * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. * * XXX - Do be sure to remove it once all arches implement it. */ void __weak read_persistent_clock(struct timespec *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; } void __weak read_persistent_clock64(struct timespec64 *ts64) { struct timespec ts; read_persistent_clock(&ts); *ts64 = timespec_to_timespec64(ts); } /** * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset * from the boot. * * Weak dummy function for arches that do not yet support it. * wall_time - current time as returned by persistent clock * boot_offset - offset that is defined as wall_time - boot_time * The default function calculates offset based on the current value of * local_clock(). This way architectures that support sched_clock() but don't * support dedicated boot time clock will provide the best estimate of the * boot time. */ void __weak __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, struct timespec64 *boot_offset) { read_persistent_clock64(wall_time); *boot_offset = ns_to_timespec64(local_clock()); } /* * Flag reflecting whether timekeeping_resume() has injected sleeptime. * * The flag starts of false and is only set when a suspend reaches * timekeeping_suspend(), timekeeping_resume() sets it to false when the * timekeeper clocksource is not stopping across suspend and has been * used to update sleep time. If the timekeeper clocksource has stopped * then the flag stays true and is used by the RTC resume code to decide * whether sleeptime must be injected and if so the flag gets false then. * * If a suspend fails before reaching timekeeping_resume() then the flag * stays false and prevents erroneous sleeptime injection. */ static bool suspend_timing_needed; /* Flag for if there is a persistent clock on this platform */ static bool persistent_clock_exists; /* * timekeeping_init - Initializes the clocksource and common timekeeping values */ void __init timekeeping_init(void) { struct timespec64 wall_time, boot_offset, wall_to_mono; struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *clock; unsigned long flags; read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && timespec64_to_ns(&wall_time) > 0) { persistent_clock_exists = true; } else if (timespec64_to_ns(&wall_time) != 0) { pr_warn("Persistent clock returned invalid value"); wall_time = (struct timespec64){0}; } if (timespec64_compare(&wall_time, &boot_offset) < 0) boot_offset = (struct timespec64){0}; /* * We want set wall_to_mono, so the following is true: * wall time + wall_to_mono = boot time */ wall_to_mono = timespec64_sub(boot_offset, wall_time); raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); ntp_init(); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); tk_setup_internals(tk, clock); tk_set_xtime(tk, &wall_time); tk->raw_sec = 0; tk_set_wall_to_mono(tk, wall_to_mono); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } /* time in seconds when suspend began for persistent clock */ static struct timespec64 timekeeping_suspend_time; /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval * @delta: pointer to a timespec delta value * * Takes a timespec offset measuring a suspend interval and properly * adds the sleep offset to the timekeeping variables. */ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, const struct timespec64 *delta) { if (!timespec64_valid_strict(delta)) { printk_deferred(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " "sleep delta value!\n"); return; } tk_xtime_add(tk, delta); tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); tk_update_sleep_time(tk, timespec64_to_ktime(*delta)); tk_debug_account_sleep_time(delta); } #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) /** * We have three kinds of time sources to use for sleep time * injection, the preference order is: * 1) non-stop clocksource * 2) persistent clock (ie: RTC accessible when irqs are off) * 3) RTC * * 1) and 2) are used by timekeeping, 3) by RTC subsystem. * If system has neither 1) nor 2), 3) will be used finally. * * * If timekeeping has injected sleeptime via either 1) or 2), * 3) becomes needless, so in this case we don't need to call * rtc_resume(), and this is what timekeeping_rtc_skipresume() * means. */ bool timekeeping_rtc_skipresume(void) { return !suspend_timing_needed; } /** * 1) can be determined whether to use or not only when doing * timekeeping_resume() which is invoked after rtc_suspend(), * so we can't skip rtc_suspend() surely if system has 1). * * But if system has 2), 2) will definitely be used, so in this * case we don't need to call rtc_suspend(), and this is what * timekeeping_rtc_skipsuspend() means. */ bool timekeeping_rtc_skipsuspend(void) { return persistent_clock_exists; } /** * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values * @delta: pointer to a timespec64 delta value * * This hook is for architectures that cannot support read_persistent_clock64 * because their RTC/persistent clock is only accessible when irqs are enabled. * and also don't have an effective nonstop clocksource. * * This function should only be called by rtc_resume(), and allows * a suspend offset to be injected into the timekeeping values. */ void timekeeping_inject_sleeptime64(const struct timespec64 *delta) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); suspend_timing_needed = false; timekeeping_forward_now(tk); __timekeeping_inject_sleeptime(tk, delta); timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); } #endif /** * timekeeping_resume - Resumes the generic timekeeping subsystem. */ void timekeeping_resume(void) { struct timekeeper *tk = &tk_core.timekeeper; struct clocksource *clock = tk->tkr_mono.clock; unsigned long flags; struct timespec64 ts_new, ts_delta; u64 cycle_now, nsec; bool inject_sleeptime = false; read_persistent_clock64(&ts_new); clockevents_resume(); clocksource_resume(); raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); /* * After system resumes, we need to calculate the suspended time and * compensate it for the OS time. There are 3 sources that could be * used: Nonstop clocksource during suspend, persistent clock and rtc * device. * * One specific platform may have 1 or 2 or all of them, and the * preference will be: * suspend-nonstop clocksource -> persistent clock -> rtc * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. */ cycle_now = tk_clock_read(&tk->tkr_mono); nsec = clocksource_stop_suspend_timing(clock, cycle_now); if (nsec > 0) { ts_delta = ns_to_timespec64(nsec); inject_sleeptime = true; } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); inject_sleeptime = true; } if (inject_sleeptime) { suspend_timing_needed = false; __timekeeping_inject_sleeptime(tk, &ts_delta); } /* Re-base the last cycle value */ tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); touch_softlockup_watchdog(); tick_resume(); hrtimers_resume(); } int timekeeping_suspend(void) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; struct timespec64 delta, delta_delta; static struct timespec64 old_delta; struct clocksource *curr_clock; u64 cycle_now; read_persistent_clock64(&timekeeping_suspend_time); /* * On some systems the persistent_clock can not be detected at * timekeeping_init by its return value, so if we see a valid * value returned, update the persistent_clock_exists flag. */ if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) persistent_clock_exists = true; suspend_timing_needed = true; raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); timekeeping_suspended = 1; /* * Since we've called forward_now, cycle_last stores the value * just read from the current clocksource. Save this to potentially * use in suspend timing. */ curr_clock = tk->tkr_mono.clock; cycle_now = tk->tkr_mono.cycle_last; clocksource_start_suspend_timing(curr_clock, cycle_now); if (persistent_clock_exists) { /* * To avoid drift caused by repeated suspend/resumes, * which each can add ~1 second drift error, * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); delta_delta = timespec64_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* * if delta_delta is too large, assume time correction * has occurred and set old_delta to the current delta. */ old_delta = delta; } else { /* Otherwise try to adjust old_system to compensate */ timekeeping_suspend_time = timespec64_add(timekeeping_suspend_time, delta_delta); } } timekeeping_update(tk, TK_MIRROR); halt_fast_timekeeper(tk); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); tick_suspend(); clocksource_suspend(); clockevents_suspend(); return 0; } /* sysfs resume/suspend bits for timekeeping */ static struct syscore_ops timekeeping_syscore_ops = { .resume = timekeeping_resume, .suspend = timekeeping_suspend, }; static int __init timekeeping_init_ops(void) { register_syscore_ops(&timekeeping_syscore_ops); return 0; } device_initcall(timekeeping_init_ops); /* * Apply a multiplier adjustment to the timekeeper */ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, s64 offset, s32 mult_adj) { s64 interval = tk->cycle_interval; if (mult_adj == 0) { return; } else if (mult_adj == -1) { interval = -interval; offset = -offset; } else if (mult_adj != 1) { interval *= mult_adj; offset *= mult_adj; } /* * So the following can be confusing. * * To keep things simple, lets assume mult_adj == 1 for now. * * When mult_adj != 1, remember that the interval and offset values * have been appropriately scaled so the math is the same. * * The basic idea here is that we're increasing the multiplier * by one, this causes the xtime_interval to be incremented by * one cycle_interval. This is because: * xtime_interval = cycle_interval * mult * So if mult is being incremented by one: * xtime_interval = cycle_interval * (mult + 1) * Its the same as: * xtime_interval = (cycle_interval * mult) + cycle_interval * Which can be shortened to: * xtime_interval += cycle_interval * * So offset stores the non-accumulated cycles. Thus the current * time (in shifted nanoseconds) is: * now = (offset * adj) + xtime_nsec * Now, even though we're adjusting the clock frequency, we have * to keep time consistent. In other words, we can't jump back * in time, and we also want to avoid jumping forward in time. * * So given the same offset value, we need the time to be the same * both before and after the freq adjustment. * now = (offset * adj_1) + xtime_nsec_1 * now = (offset * adj_2) + xtime_nsec_2 * So: * (offset * adj_1) + xtime_nsec_1 = * (offset * adj_2) + xtime_nsec_2 * And we know: * adj_2 = adj_1 + 1 * So: * (offset * adj_1) + xtime_nsec_1 = * (offset * (adj_1+1)) + xtime_nsec_2 * (offset * adj_1) + xtime_nsec_1 = * (offset * adj_1) + offset + xtime_nsec_2 * Canceling the sides: * xtime_nsec_1 = offset + xtime_nsec_2 * Which gives us: * xtime_nsec_2 = xtime_nsec_1 - offset * Which simplfies to: * xtime_nsec -= offset */ if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { /* NTP adjustment caused clocksource mult overflow */ WARN_ON_ONCE(1); return; } tk->tkr_mono.mult += mult_adj; tk->xtime_interval += interval; tk->tkr_mono.xtime_nsec -= offset; } /* * Adjust the timekeeper's multiplier to the correct frequency * and also to reduce the accumulated error value. */ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { u32 mult; /* * Determine the multiplier from the current NTP tick length. * Avoid expensive division when the tick length doesn't change. */ if (likely(tk->ntp_tick == ntp_tick_length())) { mult = tk->tkr_mono.mult - tk->ntp_err_mult; } else { tk->ntp_tick = ntp_tick_length(); mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) - tk->xtime_remainder, tk->cycle_interval); } /* * If the clock is behind the NTP time, increase the multiplier by 1 * to catch up with it. If it's ahead and there was a remainder in the * tick division, the clock will slow down. Otherwise it will stay * ahead until the tick length changes to a non-divisible value. */ tk->ntp_err_mult = tk->ntp_error > 0 ? 1 : 0; mult += tk->ntp_err_mult; timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult); if (unlikely(tk->tkr_mono.clock->maxadj && (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) > tk->tkr_mono.clock->maxadj))) { printk_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult, (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj); } /* * It may be possible that when we entered this function, xtime_nsec * was very small. Further, if we're slightly speeding the clocksource * in the code above, its possible the required corrective factor to * xtime_nsec could cause it to underflow. * * Now, since we have already accumulated the second and the NTP * subsystem has been notified via second_overflow(), we need to skip * the next update. */ if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec--; tk->skip_second_overflow = 1; } } /** * accumulate_nsecs_to_secs - Accumulates nsecs into secs * * Helper function that accumulates the nsecs greater than a second * from the xtime_nsec field to the xtime_secs field. * It also calls into the NTP code to handle leapsecond processing. * */ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift; unsigned int clock_set = 0; while (tk->tkr_mono.xtime_nsec >= nsecps) { int leap; tk->tkr_mono.xtime_nsec -= nsecps; tk->xtime_sec++; /* * Skip NTP update if this second was accumulated before, * i.e. xtime_nsec underflowed in timekeeping_adjust() */ if (unlikely(tk->skip_second_overflow)) { tk->skip_second_overflow = 0; continue; } /* Figure out if its a leap sec and apply if needed */ leap = second_overflow(tk->xtime_sec); if (unlikely(leap)) { struct timespec64 ts; tk->xtime_sec += leap; ts.tv_sec = leap; ts.tv_nsec = 0; tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts)); __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); clock_set = TK_CLOCK_WAS_SET; } } return clock_set; } /** * logarithmic_accumulation - shifted accumulation of cycles * * This functions accumulates a shifted interval of cycles into * into a shifted interval nanoseconds. Allows for O(log) accumulation * loop. * * Returns the unconsumed cycles. */ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, u32 shift, unsigned int *clock_set) { u64 interval = tk->cycle_interval << shift; u64 snsec_per_sec; /* If the offset is smaller than a shifted interval, do nothing */ if (offset < interval) return offset; /* Accumulate one shifted interval */ offset -= interval; tk->tkr_mono.cycle_last += interval; tk->tkr_raw.cycle_last += interval; tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ tk->tkr_raw.xtime_nsec += tk->raw_interval << shift; snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift; while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) { tk->tkr_raw.xtime_nsec -= snsec_per_sec; tk->raw_sec++; } /* Accumulate error between NTP and clock interval */ tk->ntp_error += tk->ntp_tick << shift; tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) << (tk->ntp_error_shift + shift); return offset; } /* * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ static void timekeeping_advance(enum timekeeping_adv_mode mode) { struct timekeeper *real_tk = &tk_core.timekeeper; struct timekeeper *tk = &shadow_timekeeper; u64 offset; int shift = 0, maxshift; unsigned int clock_set = 0; unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) goto out; #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = real_tk->cycle_interval; if (mode != TK_ADV_TICK) goto out; #else offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask); /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) goto out; #endif /* Do some additional sanity checking */ timekeeping_check_update(tk, offset); /* * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. To do this efficiently, * we calculate the largest doubling multiple of cycle_intervals * that is smaller than the offset. We then accumulate that * chunk in one go, and then try to consume the next smaller * doubled multiple. */ shift = ilog2(offset) - ilog2(tk->cycle_interval); shift = max(0, shift); /* Bound shift to one less than what overflows tick_length */ maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; shift = min(shift, maxshift); while (offset >= tk->cycle_interval) { offset = logarithmic_accumulation(tk, offset, shift, &clock_set); if (offset < tk->cycle_interval<<shift) shift--; } /* Adjust the multiplier to correct NTP error */ timekeeping_adjust(tk, offset); /* * Finally, make sure that after the rounding * xtime_nsec isn't larger than NSEC_PER_SEC */ clock_set |= accumulate_nsecs_to_secs(tk); write_seqcount_begin(&tk_core.seq); /* * Update the real timekeeper. * * We could avoid this memcpy by switching pointers, but that * requires changes to all other timekeeper usage sites as * well, i.e. move the timekeeper pointer getter into the * spinlocked/seqcount protected sections. And we trade this * memcpy under the tk_core.seq against one before we start * updating. */ timekeeping_update(tk, clock_set); memcpy(real_tk, tk, sizeof(*tk)); /* The memcpy must come last. Do not put anything here! */ write_seqcount_end(&tk_core.seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); if (clock_set) /* Have to call _delayed version, since in irq context*/ clock_was_set_delayed(); } /** * update_wall_time - Uses the current clocksource to increment the wall time * */ void update_wall_time(void) { timekeeping_advance(TK_ADV_TICK); } /** * getboottime64 - Return the real time of system boot. * @ts: pointer to the timespec64 to be set * * Returns the wall-time of boot in a timespec64. * * This is based on the wall_to_monotonic offset and the total suspend * time. Calls to settimeofday will affect the value returned (which * basically means that however wrong your real time clock is at boot time, * you get the right time here). */ void getboottime64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); *ts = ktime_to_timespec64(t); } EXPORT_SYMBOL_GPL(getboottime64); void ktime_get_coarse_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; do { seq = read_seqcount_begin(&tk_core.seq); *ts = tk_xtime(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); } EXPORT_SYMBOL(ktime_get_coarse_real_ts64); void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now, mono; unsigned long seq; do { seq = read_seqcount_begin(&tk_core.seq); now = tk_xtime(tk); mono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); } EXPORT_SYMBOL(ktime_get_coarse_ts64); /* * Must hold jiffies_lock */ void do_timer(unsigned long ticks) { jiffies_64 += ticks; calc_global_load(ticks); } /** * ktime_get_update_offsets_now - hrtimer helper * @cwsseq: pointer to check and store the clock was set sequence number * @offs_real: pointer to storage for monotonic -> realtime offset * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset * * Returns current monotonic time and updates the offsets if the * sequence number in @cwsseq and timekeeper.clock_was_set_seq are * different. * * Called from hrtimer_interrupt() or retrigger_next_event() */ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, ktime_t *offs_boot, ktime_t *offs_tai) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); base = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); base = ktime_add_ns(base, nsecs); if (*cwsseq != tk->clock_was_set_seq) { *cwsseq = tk->clock_was_set_seq; *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; } /* Handle leapsecond insertion adjustments */ if (unlikely(base >= tk->next_leap_ktime)) *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0)); } while (read_seqcount_retry(&tk_core.seq, seq)); return base; } /** * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ static int timekeeping_validate_timex(const struct timex *txc) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) return -EINVAL; if (!(txc->modes & ADJ_OFFSET_READONLY) && !capable(CAP_SYS_TIME)) return -EPERM; } else { /* In order to modify anything, you gotta be super-user! */ if (txc->modes && !capable(CAP_SYS_TIME)) return -EPERM; /* * if the quartz is off by more than 10% then * something is VERY wrong! */ if (txc->modes & ADJ_TICK && (txc->tick < 900000/USER_HZ || txc->tick > 1100000/USER_HZ)) return -EINVAL; } if (txc->modes & ADJ_SETOFFSET) { /* In order to inject time, you gotta be super-user! */ if (!capable(CAP_SYS_TIME)) return -EPERM; /* * Validate if a timespec/timeval used to inject a time * offset is valid. Offsets can be postive or negative, so * we don't check tv_sec. The value of the timeval/timespec * is the sum of its fields,but *NOTE*: * The field tv_usec/tv_nsec must always be non-negative and * we can't have more nanoseconds/microseconds than a second. */ if (txc->time.tv_usec < 0) return -EINVAL; if (txc->modes & ADJ_NANO) { if (txc->time.tv_usec >= NSEC_PER_SEC) return -EINVAL; } else { if (txc->time.tv_usec >= USEC_PER_SEC) return -EINVAL; } } /* * Check for potential multiplication overflows that can * only happen on 64-bit systems: */ if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { if (LLONG_MIN / PPM_SCALE > txc->freq) return -EINVAL; if (LLONG_MAX / PPM_SCALE < txc->freq) return -EINVAL; } return 0; } /** * random_get_entropy_fallback - Returns the raw clock source value, * used by random.c for platforms with no valid random_get_entropy(). */ unsigned long random_get_entropy_fallback(void) { struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono; struct clocksource *clock = READ_ONCE(tkr->clock); if (unlikely(timekeeping_suspended || !clock)) return 0; return clock->read(clock); } EXPORT_SYMBOL_GPL(random_get_entropy_fallback); /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function */ int do_adjtimex(struct timex *txc) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; struct timespec64 ts; s32 orig_tai, tai; int ret; /* Validate the data before disabling interrupts */ ret = timekeeping_validate_timex(txc); if (ret) return ret; if (txc->modes & ADJ_SETOFFSET) { struct timespec64 delta; delta.tv_sec = txc->time.tv_sec; delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) delta.tv_nsec *= 1000; ret = timekeeping_inject_offset(&delta); if (ret) return ret; } ktime_get_real_ts64(&ts); raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); orig_tai = tai = tk->tai_offset; ret = __do_adjtimex(txc, &ts, &tai); if (tai != orig_tai) { __timekeeping_set_tai_offset(tk, tai); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); } tk_update_leap_state(tk); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); /* Update the multiplier immediately if frequency was set directly */ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) timekeeping_advance(TK_ADV_FREQ); if (tai != orig_tai) clock_was_set(); ntp_notify_cmos_timer(); return ret; } #ifdef CONFIG_NTP_PPS /** * hardpps() - Accessor function to NTP __hardpps function */ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); __hardpps(phase_ts, raw_ts); write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ /** * xtime_update() - advances the timekeeping infrastructure * @ticks: number of ticks, that have elapsed since the last call. * * Must be called with interrupts disabled. */ void xtime_update(unsigned long ticks) { write_seqlock(&jiffies_lock); do_timer(ticks); write_sequnlock(&jiffies_lock); update_wall_time(); } |
3400 3289 3398 3921 612 3924 3400 3400 3354 3352 2 3351 2196 867 2193 880 2195 764 699 78 55 2 54 3 76 785 784 783 785 811 85 811 785 28 93 33 13 821 815 818 1 810 820 816 24 90 93 24 21 1 1357 1355 1054 1053 1353 65 65 65 2396 2525 2871 573 573 573 93 574 237 237 3 2 453 475 469 469 469 1 467 468 469 417 415 413 11 402 316 312 233 136 186 41 36 394 398 413 354 354 350 192 194 69 69 64 63 64 45 44 42 26 42 6 5 2 6 35 42 45 26 16 12 23 19 61 25 61 10 16 60 59 50 27 30 10 9 45 44 4 45 25 24 25 17 15 27 27 22 21 22 2 1 1 1 1 1 1 2 1 1 1 94 94 94 56 58 9 6 5 97 94 94 33 94 82 81 4 79 93 91 83 97 2 27 27 29 40 37 12 37 2 1 27 28 27 40 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 | /* File: fs/xattr.c Extended attribute handling. Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org> Copyright (C) 2001 SGI - Silicon Graphics, Inc <linux-xfs@oss.sgi.com> Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #include <linux/fs.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/xattr.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/evm.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fsnotify.h> #include <linux/audit.h> #include <linux/vmalloc.h> #include <linux/posix_acl_xattr.h> #include <linux/uaccess.h> static const char * strcmp_prefix(const char *a, const char *a_prefix) { while (*a_prefix && *a == *a_prefix) { a++; a_prefix++; } return *a_prefix ? NULL : a; } /* * In order to implement different sets of xattr operations for each xattr * prefix, a filesystem should create a null-terminated array of struct * xattr_handler (one for each prefix) and hang a pointer to it off of the * s_xattr field of the superblock. */ #define for_each_xattr_handler(handlers, handler) \ if (handlers) \ for ((handler) = *(handlers)++; \ (handler) != NULL; \ (handler) = *(handlers)++) /* * Find the xattr_handler with the matching prefix. */ static const struct xattr_handler * xattr_resolve_name(struct inode *inode, const char **name) { const struct xattr_handler **handlers = inode->i_sb->s_xattr; const struct xattr_handler *handler; if (!(inode->i_opflags & IOP_XATTR)) { if (unlikely(is_bad_inode(inode))) return ERR_PTR(-EIO); return ERR_PTR(-EOPNOTSUPP); } for_each_xattr_handler(handlers, handler) { const char *n; n = strcmp_prefix(*name, xattr_prefix(handler)); if (n) { if (!handler->prefix ^ !*n) { if (*n) continue; return ERR_PTR(-EINVAL); } *name = n; return handler; } } return ERR_PTR(-EOPNOTSUPP); } /* * Check permissions for extended attribute access. This is a bit complicated * because different namespaces have very different rules. */ static int xattr_permission(struct inode *inode, const char *name, int mask) { /* * We can never set or remove an extended attribute on a read-only * filesystem or on an immutable / append-only inode. */ if (mask & MAY_WRITE) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; /* * Updating an xattr will likely cause i_uid and i_gid * to be writen back improperly if their true value is * unknown to the vfs. */ if (HAS_UNMAPPED_ID(inode)) return -EPERM; } /* * No restriction for security.* and system.* from the VFS. Decision * on these is left to the underlying filesystem / security module. */ if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return 0; /* * The trusted.* namespace can only be accessed by privileged users. */ if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { if (!capable(CAP_SYS_ADMIN)) return (mask & MAY_WRITE) ? -EPERM : -ENODATA; return 0; } /* * In the user.* namespace, only regular files and directories can have * extended attributes. For sticky directories, only the owner and * privileged users can write attributes. */ if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) return (mask & MAY_WRITE) ? -EPERM : -ENODATA; if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && (mask & MAY_WRITE) && !inode_owner_or_capable(inode)) return -EPERM; } return inode_permission(inode, mask); } int __vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { const struct xattr_handler *handler; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; if (size == 0) value = ""; /* empty EA, do not remove */ return handler->set(handler, dentry, inode, name, value, size, flags); } EXPORT_SYMBOL(__vfs_setxattr); /** * __vfs_setxattr_noperm - perform setxattr operation without performing * permission checks. * * @dentry - object to perform setxattr on * @name - xattr name to set * @value - value to set @name to * @size - size of @value * @flags - flags to pass into filesystem operations * * returns the result of the internal setxattr or setsecurity operations. * * This function requires the caller to lock the inode's i_mutex before it * is executed. It also assumes that the caller will make the appropriate * permission checks. */ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; int error = -EAGAIN; int issec = !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); if (issec) inode->i_flags &= ~S_NOSEC; if (inode->i_opflags & IOP_XATTR) { error = __vfs_setxattr(dentry, inode, name, value, size, flags); if (!error) { fsnotify_xattr(dentry); security_inode_post_setxattr(dentry, name, value, size, flags); } } else { if (unlikely(is_bad_inode(inode))) return -EIO; } if (error == -EAGAIN) { error = -EOPNOTSUPP; if (issec) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; error = security_inode_setsecurity(inode, suffix, value, size, flags); if (!error) fsnotify_xattr(dentry); } } return error; } /** * __vfs_setxattr_locked: set an extended attribute while holding the inode * lock * * @dentry - object to perform setxattr on * @name - xattr name to set * @value - value to set @name to * @size - size of @value * @flags - flags to pass into filesystem operations * @delegated_inode - on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int __vfs_setxattr_locked(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(inode, name, MAY_WRITE); if (error) return error; error = security_inode_setxattr(dentry, name, value, size, flags); if (error) goto out; error = try_break_deleg(inode, delegated_inode); if (error) goto out; error = __vfs_setxattr_noperm(dentry, name, value, size, flags); out: return error; } EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); int vfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; int error; retry_deleg: inode_lock(inode); error = __vfs_setxattr_locked(dentry, name, value, size, flags, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } EXPORT_SYMBOL_GPL(vfs_setxattr); static ssize_t xattr_getsecurity(struct inode *inode, const char *name, void *value, size_t size) { void *buffer = NULL; ssize_t len; if (!value || !size) { len = security_inode_getsecurity(inode, name, &buffer, false); goto out_noalloc; } len = security_inode_getsecurity(inode, name, &buffer, true); if (len < 0) return len; if (size < len) { len = -ERANGE; goto out; } memcpy(value, buffer, len); out: kfree(buffer); out_noalloc: return len; } /* * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr * * Allocate memory, if not already allocated, or re-allocate correct size, * before retrieving the extended attribute. * * Returns the result of alloc, if failed, or the getxattr operation. */ ssize_t vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, size_t xattr_size, gfp_t flags) { const struct xattr_handler *handler; struct inode *inode = dentry->d_inode; char *value = *xattr_value; int error; error = xattr_permission(inode, name, MAY_READ); if (error) return error; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->get) return -EOPNOTSUPP; error = handler->get(handler, dentry, inode, name, NULL, 0); if (error < 0) return error; if (!value || (error > xattr_size)) { value = krealloc(*xattr_value, error + 1, flags); if (!value) return -ENOMEM; memset(value, 0, error + 1); } error = handler->get(handler, dentry, inode, name, value, error); *xattr_value = value; return error; } ssize_t __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { const struct xattr_handler *handler; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->get) return -EOPNOTSUPP; return handler->get(handler, dentry, inode, name, value, size); } EXPORT_SYMBOL(__vfs_getxattr); ssize_t vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(inode, name, MAY_READ); if (error) return error; error = security_inode_getxattr(dentry, name); if (error) return error; if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; int ret = xattr_getsecurity(inode, suffix, value, size); /* * Only overwrite the return value if a security module * is actually active. */ if (ret == -EOPNOTSUPP) goto nolsm; return ret; } nolsm: return __vfs_getxattr(dentry, inode, name, value, size); } EXPORT_SYMBOL_GPL(vfs_getxattr); ssize_t vfs_listxattr(struct dentry *dentry, char *list, size_t size) { struct inode *inode = d_inode(dentry); ssize_t error; error = security_inode_listxattr(dentry); if (error) return error; if (inode->i_op->listxattr && (inode->i_opflags & IOP_XATTR)) { error = inode->i_op->listxattr(dentry, list, size); } else { error = security_inode_listsecurity(inode, list, size); if (size && error > size) error = -ERANGE; } return error; } EXPORT_SYMBOL_GPL(vfs_listxattr); int __vfs_removexattr(struct dentry *dentry, const char *name) { struct inode *inode = d_inode(dentry); const struct xattr_handler *handler; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; return handler->set(handler, dentry, inode, name, NULL, 0, XATTR_REPLACE); } EXPORT_SYMBOL(__vfs_removexattr); /** * __vfs_removexattr_locked: set an extended attribute while holding the inode * lock * * @dentry - object to perform setxattr on * @name - name of xattr to remove * @delegated_inode - on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int __vfs_removexattr_locked(struct dentry *dentry, const char *name, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(inode, name, MAY_WRITE); if (error) return error; error = security_inode_removexattr(dentry, name); if (error) goto out; error = try_break_deleg(inode, delegated_inode); if (error) goto out; error = __vfs_removexattr(dentry, name); if (!error) { fsnotify_xattr(dentry); evm_inode_post_removexattr(dentry, name); } out: return error; } EXPORT_SYMBOL_GPL(__vfs_removexattr_locked); int vfs_removexattr(struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; int error; retry_deleg: inode_lock(inode); error = __vfs_removexattr_locked(dentry, name, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); /* * Extended attribute SET operations */ static long setxattr(struct dentry *d, const char __user *name, const void __user *value, size_t size, int flags) { int error; void *kvalue = NULL; char kname[XATTR_NAME_MAX + 1]; if (flags & ~(XATTR_CREATE|XATTR_REPLACE)) return -EINVAL; error = strncpy_from_user(kname, name, sizeof(kname)); if (error == 0 || error == sizeof(kname)) error = -ERANGE; if (error < 0) return error; if (size) { if (size > XATTR_SIZE_MAX) return -E2BIG; kvalue = kvmalloc(size, GFP_KERNEL); if (!kvalue) return -ENOMEM; if (copy_from_user(kvalue, value, size)) { error = -EFAULT; goto out; } if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) posix_acl_fix_xattr_from_user(kvalue, size); else if (strcmp(kname, XATTR_NAME_CAPS) == 0) { error = cap_convert_nscap(d, &kvalue, size); if (error < 0) goto out; size = error; } } error = vfs_setxattr(d, kname, kvalue, size, flags); out: kvfree(kvalue); return error; } static int path_setxattr(const char __user *pathname, const char __user *name, const void __user *value, size_t size, int flags, unsigned int lookup_flags) { struct path path; int error; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = mnt_want_write(path.mnt); if (!error) { error = setxattr(path.dentry, name, value, size, flags); mnt_drop_write(path.mnt); } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE5(setxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { return path_setxattr(pathname, name, value, size, flags, LOOKUP_FOLLOW); } SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { return path_setxattr(pathname, name, value, size, flags, 0); } SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, const void __user *,value, size_t, size, int, flags) { struct fd f = fdget(fd); int error = -EBADF; if (!f.file) return error; audit_file(f.file); error = mnt_want_write_file(f.file); if (!error) { error = setxattr(f.file->f_path.dentry, name, value, size, flags); mnt_drop_write_file(f.file); } fdput(f); return error; } /* * Extended attribute GET operations */ static ssize_t getxattr(struct dentry *d, const char __user *name, void __user *value, size_t size) { ssize_t error; void *kvalue = NULL; char kname[XATTR_NAME_MAX + 1]; error = strncpy_from_user(kname, name, sizeof(kname)); if (error == 0 || error == sizeof(kname)) error = -ERANGE; if (error < 0) return error; if (size) { if (size > XATTR_SIZE_MAX) size = XATTR_SIZE_MAX; kvalue = kvzalloc(size, GFP_KERNEL); if (!kvalue) return -ENOMEM; } error = vfs_getxattr(d, kname, kvalue, size); if (error > 0) { if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) posix_acl_fix_xattr_to_user(kvalue, error); if (size && copy_to_user(value, kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) { /* The file system tried to returned a value bigger than XATTR_SIZE_MAX bytes. Not possible. */ error = -E2BIG; } kvfree(kvalue); return error; } static ssize_t path_getxattr(const char __user *pathname, const char __user *name, void __user *value, size_t size, unsigned int lookup_flags) { struct path path; ssize_t error; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = getxattr(path.dentry, name, value, size); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE4(getxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { return path_getxattr(pathname, name, value, size, LOOKUP_FOLLOW); } SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { return path_getxattr(pathname, name, value, size, 0); } SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, void __user *, value, size_t, size) { struct fd f = fdget(fd); ssize_t error = -EBADF; if (!f.file) return error; audit_file(f.file); error = getxattr(f.file->f_path.dentry, name, value, size); fdput(f); return error; } /* * Extended attribute LIST operations */ static ssize_t listxattr(struct dentry *d, char __user *list, size_t size) { ssize_t error; char *klist = NULL; if (size) { if (size > XATTR_LIST_MAX) size = XATTR_LIST_MAX; klist = kvmalloc(size, GFP_KERNEL); if (!klist) return -ENOMEM; } error = vfs_listxattr(d, klist, size); if (error > 0) { if (size && copy_to_user(list, klist, error)) error = -EFAULT; } else if (error == -ERANGE && size >= XATTR_LIST_MAX) { /* The file system tried to returned a list bigger than XATTR_LIST_MAX bytes. Not possible. */ error = -E2BIG; } kvfree(klist); return error; } static ssize_t path_listxattr(const char __user *pathname, char __user *list, size_t size, unsigned int lookup_flags) { struct path path; ssize_t error; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = listxattr(path.dentry, list, size); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list, size_t, size) { return path_listxattr(pathname, list, size, LOOKUP_FOLLOW); } SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list, size_t, size) { return path_listxattr(pathname, list, size, 0); } SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) { struct fd f = fdget(fd); ssize_t error = -EBADF; if (!f.file) return error; audit_file(f.file); error = listxattr(f.file->f_path.dentry, list, size); fdput(f); return error; } /* * Extended attribute REMOVE operations */ static long removexattr(struct dentry *d, const char __user *name) { int error; char kname[XATTR_NAME_MAX + 1]; error = strncpy_from_user(kname, name, sizeof(kname)); if (error == 0 || error == sizeof(kname)) error = -ERANGE; if (error < 0) return error; return vfs_removexattr(d, kname); } static int path_removexattr(const char __user *pathname, const char __user *name, unsigned int lookup_flags) { struct path path; int error; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = mnt_want_write(path.mnt); if (!error) { error = removexattr(path.dentry, name); mnt_drop_write(path.mnt); } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } return error; } SYSCALL_DEFINE2(removexattr, const char __user *, pathname, const char __user *, name) { return path_removexattr(pathname, name, LOOKUP_FOLLOW); } SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, const char __user *, name) { return path_removexattr(pathname, name, 0); } SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) { struct fd f = fdget(fd); int error = -EBADF; if (!f.file) return error; audit_file(f.file); error = mnt_want_write_file(f.file); if (!error) { error = removexattr(f.file->f_path.dentry, name); mnt_drop_write_file(f.file); } fdput(f); return error; } /* * Combine the results of the list() operation from every xattr_handler in the * list. */ ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; unsigned int size = 0; if (!buffer) { for_each_xattr_handler(handlers, handler) { if (!handler->name || (handler->list && !handler->list(dentry))) continue; size += strlen(handler->name) + 1; } } else { char *buf = buffer; size_t len; for_each_xattr_handler(handlers, handler) { if (!handler->name || (handler->list && !handler->list(dentry))) continue; len = strlen(handler->name); if (len + 1 > buffer_size) return -ERANGE; memcpy(buf, handler->name, len + 1); buf += len + 1; buffer_size -= len + 1; } size = buf - buffer; } return size; } EXPORT_SYMBOL(generic_listxattr); /** * xattr_full_name - Compute full attribute name from suffix * * @handler: handler of the xattr_handler operation * @name: name passed to the xattr_handler operation * * The get and set xattr handler operations are called with the remainder of * the attribute name after skipping the handler's prefix: for example, "foo" * is passed to the get operation of a handler with prefix "user." to get * attribute "user.foo". The full name is still "there" in the name though. * * Note: the list xattr handler operation when called from the vfs is passed a * NULL name; some file systems use this operation internally, with varying * semantics. */ const char *xattr_full_name(const struct xattr_handler *handler, const char *name) { size_t prefix_len = strlen(xattr_prefix(handler)); return name - prefix_len; } EXPORT_SYMBOL(xattr_full_name); /* * Allocate new xattr and copy in the value; but leave the name to callers. */ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) { struct simple_xattr *new_xattr; size_t len; /* wrap around? */ len = sizeof(*new_xattr) + size; if (len < sizeof(*new_xattr)) return NULL; new_xattr = kmalloc(len, GFP_KERNEL); if (!new_xattr) return NULL; new_xattr->size = size; memcpy(new_xattr->value, value, size); return new_xattr; } /* * xattr GET operation for in-memory/pseudo filesystems */ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size) { struct simple_xattr *xattr; int ret = -ENODATA; spin_lock(&xattrs->lock); list_for_each_entry(xattr, &xattrs->head, list) { if (strcmp(name, xattr->name)) continue; ret = xattr->size; if (buffer) { if (size < xattr->size) ret = -ERANGE; else memcpy(buffer, xattr->value, xattr->size); } break; } spin_unlock(&xattrs->lock); return ret; } /** * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems * @xattrs: target simple_xattr list * @name: name of the extended attribute * @value: value of the xattr. If %NULL, will remove the attribute. * @size: size of the new xattr * @flags: %XATTR_{CREATE|REPLACE} * * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist; * otherwise, fails with -ENODATA. * * Returns 0 on success, -errno on failure. */ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags) { struct simple_xattr *xattr; struct simple_xattr *new_xattr = NULL; int err = 0; /* value == NULL means remove */ if (value) { new_xattr = simple_xattr_alloc(value, size); if (!new_xattr) return -ENOMEM; new_xattr->name = kstrdup(name, GFP_KERNEL); if (!new_xattr->name) { kfree(new_xattr); return -ENOMEM; } } spin_lock(&xattrs->lock); list_for_each_entry(xattr, &xattrs->head, list) { if (!strcmp(name, xattr->name)) { if (flags & XATTR_CREATE) { xattr = new_xattr; err = -EEXIST; } else if (new_xattr) { list_replace(&xattr->list, &new_xattr->list); } else { list_del(&xattr->list); } goto out; } } if (flags & XATTR_REPLACE) { xattr = new_xattr; err = -ENODATA; } else { list_add(&new_xattr->list, &xattrs->head); xattr = NULL; } out: spin_unlock(&xattrs->lock); if (xattr) { kfree(xattr->name); kfree(xattr); } return err; } static bool xattr_is_trusted(const char *name) { return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } static int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name) { size_t len = strlen(name) + 1; if (*buffer) { if (*remaining_size < len) return -ERANGE; memcpy(*buffer, name, len); *buffer += len; } *remaining_size -= len; return 0; } /* * xattr LIST operation for in-memory/pseudo filesystems */ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size) { bool trusted = capable(CAP_SYS_ADMIN); struct simple_xattr *xattr; ssize_t remaining_size = size; int err = 0; #ifdef CONFIG_FS_POSIX_ACL if (IS_POSIXACL(inode)) { if (inode->i_acl) { err = xattr_list_one(&buffer, &remaining_size, XATTR_NAME_POSIX_ACL_ACCESS); if (err) return err; } if (inode->i_default_acl) { err = xattr_list_one(&buffer, &remaining_size, XATTR_NAME_POSIX_ACL_DEFAULT); if (err) return err; } } #endif spin_lock(&xattrs->lock); list_for_each_entry(xattr, &xattrs->head, list) { /* skip "trusted." attributes for unprivileged callers */ if (!trusted && xattr_is_trusted(xattr->name)) continue; err = xattr_list_one(&buffer, &remaining_size, xattr->name); if (err) break; } spin_unlock(&xattrs->lock); return err ? err : size - remaining_size; } /* * Adds an extended attribute to the list */ void simple_xattr_list_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr) { spin_lock(&xattrs->lock); list_add(&new_xattr->list, &xattrs->head); spin_unlock(&xattrs->lock); } |
2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/xattr_user.c * * Vyacheslav Dubeyko <slava@dubeyko.com> * * Handler for user extended attributes. */ #include <linux/nls.h> #include "hfsplus_fs.h" #include "xattr.h" static int hfsplus_user_getxattr(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { return hfsplus_getxattr(inode, name, buffer, size, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } static int hfsplus_user_setxattr(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { return hfsplus_setxattr(inode, name, buffer, size, flags, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } const struct xattr_handler hfsplus_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .get = hfsplus_user_getxattr, .set = hfsplus_user_setxattr, }; |
58 59 8 59 59 59 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/isofs/util.c */ #include <linux/time.h> #include "isofs.h" /* * We have to convert from a MM/DD/YY format to the Unix ctime format. * We have to take into account leap years and all of that good stuff. * Unfortunately, the kernel does not have the information on hand to * take into account daylight savings time, but it shouldn't matter. * The time stored should be localtime (with or without DST in effect), * and the timezone offset should hold the offset required to get back * to GMT. Thus we should always be correct. */ int iso_date(u8 *p, int flag) { int year, month, day, hour, minute, second, tz; int crtime; year = p[0]; month = p[1]; day = p[2]; hour = p[3]; minute = p[4]; second = p[5]; if (flag == 0) tz = p[6]; /* High sierra has no time zone */ else tz = 0; if (year < 0) { crtime = 0; } else { crtime = mktime64(year+1900, month, day, hour, minute, second); /* sign extend */ if (tz & 0x80) tz |= (-1 << 8); /* * The timezone offset is unreliable on some disks, * so we make a sanity check. In no case is it ever * more than 13 hours from GMT, which is 52*15min. * The time is always stored in localtime with the * timezone offset being what get added to GMT to * get to localtime. Thus we need to subtract the offset * to get to true GMT, which is what we store the time * as internally. On the local system, the user may set * their timezone any way they wish, of course, so GMT * gets converted back to localtime on the receiving * system. * * NOTE: mkisofs in versions prior to mkisofs-1.10 had * the sign wrong on the timezone offset. This has now * been corrected there too, but if you are getting screwy * results this may be the explanation. If enough people * complain, a user configuration option could be added * to add the timezone offset in with the wrong sign * for 'compatibility' with older discs, but I cannot see how * it will matter that much. * * Thanks to kuhlmav@elec.canterbury.ac.nz (Volker Kuhlmann) * for pointing out the sign error. */ if (-52 <= tz && tz <= 52) crtime -= tz * 15 * 60; } return crtime; } |
3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 8 6 6 6 6 6 5 3 1 2 2 8 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 | /* * Netlink interface for IEEE 802.15.4 stack * * Copyright 2007, 2008 Siemens AG * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * Written by: * Sergey Lapin <slapin@ossfans.org> * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> * Maxim Osipov <maxim.osipov@siemens.com> */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/if_arp.h> #include <net/netlink.h> #include <net/genetlink.h> #include <net/cfg802154.h> #include <net/af_ieee802154.h> #include <net/ieee802154_netdev.h> #include <net/rtnetlink.h> /* for rtnl_{un,}lock */ #include <linux/nl802154.h> #include "ieee802154.h" #include "rdev-ops.h" #include "core.h" static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid, u32 seq, int flags, struct wpan_phy *phy) { void *hdr; int i, pages = 0; uint32_t *buf = kcalloc(32, sizeof(uint32_t), GFP_KERNEL); pr_debug("%s\n", __func__); if (!buf) return -EMSGSIZE; hdr = genlmsg_put(msg, 0, seq, &nl802154_family, flags, IEEE802154_LIST_PHY); if (!hdr) goto out; rtnl_lock(); if (nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) || nla_put_u8(msg, IEEE802154_ATTR_PAGE, phy->current_page) || nla_put_u8(msg, IEEE802154_ATTR_CHANNEL, phy->current_channel)) goto nla_put_failure; for (i = 0; i < 32; i++) { if (phy->supported.channels[i]) buf[pages++] = phy->supported.channels[i] | (i << 27); } if (pages && nla_put(msg, IEEE802154_ATTR_CHANNEL_PAGE_LIST, pages * sizeof(uint32_t), buf)) goto nla_put_failure; rtnl_unlock(); kfree(buf); genlmsg_end(msg, hdr); return 0; nla_put_failure: rtnl_unlock(); genlmsg_cancel(msg, hdr); out: kfree(buf); return -EMSGSIZE; } int ieee802154_list_phy(struct sk_buff *skb, struct genl_info *info) { /* Request for interface name, index, type, IEEE address, * PAN Id, short address */ struct sk_buff *msg; struct wpan_phy *phy; const char *name; int rc = -ENOBUFS; pr_debug("%s\n", __func__); if (!info->attrs[IEEE802154_ATTR_PHY_NAME]) return -EINVAL; name = nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]); if (name[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1] != '\0') return -EINVAL; /* phy name should be null-terminated */ phy = wpan_phy_find(name); if (!phy) return -ENODEV; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) goto out_dev; rc = ieee802154_nl_fill_phy(msg, info->snd_portid, info->snd_seq, 0, phy); if (rc < 0) goto out_free; wpan_phy_put(phy); return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); out_dev: wpan_phy_put(phy); return rc; } struct dump_phy_data { struct sk_buff *skb; struct netlink_callback *cb; int idx, s_idx; }; static int ieee802154_dump_phy_iter(struct wpan_phy *phy, void *_data) { int rc; struct dump_phy_data *data = _data; pr_debug("%s\n", __func__); if (data->idx++ < data->s_idx) return 0; rc = ieee802154_nl_fill_phy(data->skb, NETLINK_CB(data->cb->skb).portid, data->cb->nlh->nlmsg_seq, NLM_F_MULTI, phy); if (rc < 0) { data->idx--; return rc; } return 0; } int ieee802154_dump_phy(struct sk_buff *skb, struct netlink_callback *cb) { struct dump_phy_data data = { .cb = cb, .skb = skb, .s_idx = cb->args[0], .idx = 0, }; pr_debug("%s\n", __func__); wpan_phy_for_each(ieee802154_dump_phy_iter, &data); cb->args[0] = data.idx; return skb->len; } int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; struct wpan_phy *phy; const char *name; const char *devname; int rc = -ENOBUFS; struct net_device *dev; int type = __IEEE802154_DEV_INVALID; unsigned char name_assign_type; pr_debug("%s\n", __func__); if (!info->attrs[IEEE802154_ATTR_PHY_NAME]) return -EINVAL; name = nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]); if (name[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1] != '\0') return -EINVAL; /* phy name should be null-terminated */ if (info->attrs[IEEE802154_ATTR_DEV_NAME]) { devname = nla_data(info->attrs[IEEE802154_ATTR_DEV_NAME]); if (devname[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1] != '\0') return -EINVAL; /* phy name should be null-terminated */ name_assign_type = NET_NAME_USER; } else { devname = "wpan%d"; name_assign_type = NET_NAME_ENUM; } if (strlen(devname) >= IFNAMSIZ) return -ENAMETOOLONG; phy = wpan_phy_find(name); if (!phy) return -ENODEV; msg = ieee802154_nl_new_reply(info, 0, IEEE802154_ADD_IFACE); if (!msg) goto out_dev; if (info->attrs[IEEE802154_ATTR_HW_ADDR] && nla_len(info->attrs[IEEE802154_ATTR_HW_ADDR]) != IEEE802154_ADDR_LEN) { rc = -EINVAL; goto nla_put_failure; } if (info->attrs[IEEE802154_ATTR_DEV_TYPE]) { type = nla_get_u8(info->attrs[IEEE802154_ATTR_DEV_TYPE]); if (type >= __IEEE802154_DEV_MAX) { rc = -EINVAL; goto nla_put_failure; } } dev = rdev_add_virtual_intf_deprecated(wpan_phy_to_rdev(phy), devname, name_assign_type, type); if (IS_ERR(dev)) { rc = PTR_ERR(dev); goto nla_put_failure; } dev_hold(dev); if (info->attrs[IEEE802154_ATTR_HW_ADDR]) { struct sockaddr addr; addr.sa_family = ARPHRD_IEEE802154; nla_memcpy(&addr.sa_data, info->attrs[IEEE802154_ATTR_HW_ADDR], IEEE802154_ADDR_LEN); /* strangely enough, some callbacks (inetdev_event) from * dev_set_mac_address require RTNL_LOCK */ rtnl_lock(); rc = dev_set_mac_address(dev, &addr); rtnl_unlock(); if (rc) goto dev_unregister; } if (nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) || nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name)) { rc = -EMSGSIZE; goto nla_put_failure; } dev_put(dev); wpan_phy_put(phy); return ieee802154_nl_reply(msg, info); dev_unregister: rtnl_lock(); /* del_iface must be called with RTNL lock */ rdev_del_virtual_intf_deprecated(wpan_phy_to_rdev(phy), dev); dev_put(dev); rtnl_unlock(); nla_put_failure: nlmsg_free(msg); out_dev: wpan_phy_put(phy); return rc; } int ieee802154_del_iface(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; struct wpan_phy *phy; const char *name; int rc; struct net_device *dev; pr_debug("%s\n", __func__); if (!info->attrs[IEEE802154_ATTR_DEV_NAME]) return -EINVAL; name = nla_data(info->attrs[IEEE802154_ATTR_DEV_NAME]); if (name[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1] != '\0') return -EINVAL; /* name should be null-terminated */ rc = -ENODEV; dev = dev_get_by_name(genl_info_net(info), name); if (!dev) return rc; if (dev->type != ARPHRD_IEEE802154) goto out; phy = dev->ieee802154_ptr->wpan_phy; BUG_ON(!phy); get_device(&phy->dev); rc = -EINVAL; /* phy name is optional, but should be checked if it's given */ if (info->attrs[IEEE802154_ATTR_PHY_NAME]) { struct wpan_phy *phy2; const char *pname = nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]); if (pname[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1] != '\0') /* name should be null-terminated */ goto out_dev; phy2 = wpan_phy_find(pname); if (!phy2) goto out_dev; if (phy != phy2) { wpan_phy_put(phy2); goto out_dev; } } rc = -ENOBUFS; msg = ieee802154_nl_new_reply(info, 0, IEEE802154_DEL_IFACE); if (!msg) goto out_dev; rtnl_lock(); rdev_del_virtual_intf_deprecated(wpan_phy_to_rdev(phy), dev); /* We don't have device anymore */ dev_put(dev); dev = NULL; rtnl_unlock(); if (nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) || nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, name)) goto nla_put_failure; wpan_phy_put(phy); return ieee802154_nl_reply(msg, info); nla_put_failure: nlmsg_free(msg); out_dev: wpan_phy_put(phy); out: if (dev) dev_put(dev); return rc; } |
12 1 1 1 5 5 6 1 1 6 5 5 5 5 5 5 5 5 5 5 6 1 6 6 1 5 6 6 9 7 6 1 1 4 6 3 9 3 5 5 4 4 4 9 8 8 8 7 7 7 7 7 7 9 7 5 6 6 6 6 6 6 7 8 8 8 8 8 8 8 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 | /* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF) * * Copyright (C) 2013 Terry Lam <vtlam@google.com> * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com> */ #include <linux/jiffies.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/siphash.h> #include <net/pkt_sched.h> #include <net/sock.h> /* Heavy-Hitter Filter (HHF) * * Principles : * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter * buckets. Initially, a new flow starts as non-heavy-hitter. Once classified * as heavy-hitter, it is immediately switched to the heavy-hitter bucket. * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler, * in which the heavy-hitter bucket is served with less weight. * In other words, non-heavy-hitters (e.g., short bursts of critical traffic) * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have * higher share of bandwidth. * * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the * following paper: * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and * Accounting", in ACM SIGCOMM, 2002. * * Conceptually, a multi-stage filter comprises k independent hash functions * and k counter arrays. Packets are indexed into k counter arrays by k hash * functions, respectively. The counters are then increased by the packet sizes. * Therefore, * - For a heavy-hitter flow: *all* of its k array counters must be large. * - For a non-heavy-hitter flow: some of its k array counters can be large * due to hash collision with other small flows; however, with high * probability, not *all* k counters are large. * * By the design of the multi-stage filter algorithm, the false negative rate * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is * susceptible to false positives (non-heavy-hitters mistakenly classified as * heavy-hitters). * Therefore, we also implement the following optimizations to reduce false * positives by avoiding unnecessary increment of the counter values: * - Optimization O1: once a heavy-hitter is identified, its bytes are not * accounted in the array counters. This technique is called "shielding" * in Section 3.3.1 of [EV02]. * - Optimization O2: conservative update of counters * (Section 3.3.2 of [EV02]), * New counter value = max {old counter value, * smallest counter value + packet bytes} * * Finally, we refresh the counters periodically since otherwise the counter * values will keep accumulating. * * Once a flow is classified as heavy-hitter, we also save its per-flow state * in an exact-matching flow table so that its subsequent packets can be * dispatched to the heavy-hitter bucket accordingly. * * * At a high level, this qdisc works as follows: * Given a packet p: * - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching * heavy-hitter flow table, denoted table T, then send p to the heavy-hitter * bucket. * - Otherwise, forward p to the multi-stage filter, denoted filter F * + If F decides that p belongs to a non-heavy-hitter flow, then send p * to the non-heavy-hitter bucket. * + Otherwise, if F decides that p belongs to a new heavy-hitter flow, * then set up a new flow entry for the flow-id of p in the table T and * send p to the heavy-hitter bucket. * * In this implementation: * - T is a fixed-size hash-table with 1024 entries. Hash collision is * resolved by linked-list chaining. * - F has four counter arrays, each array containing 1024 32-bit counters. * That means 4 * 1024 * 32 bits = 16KB of memory. * - Since each array in F contains 1024 counters, 10 bits are sufficient to * index into each array. * Hence, instead of having four hash functions, we chop the 32-bit * skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is * computed as XOR sum of those three chunks. * - We need to clear the counter arrays periodically; however, directly * memsetting 16KB of memory can lead to cache eviction and unwanted delay. * So by representing each counter by a valid bit, we only need to reset * 4K of 1 bit (i.e. 512 bytes) instead of 16KB of memory. * - The Deficit Round Robin engine is taken from fq_codel implementation * (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to * fq_codel_flow in fq_codel implementation. * */ /* Non-configurable parameters */ #define HH_FLOWS_CNT 1024 /* number of entries in exact-matching table T */ #define HHF_ARRAYS_CNT 4 /* number of arrays in multi-stage filter F */ #define HHF_ARRAYS_LEN 1024 /* number of counters in each array of F */ #define HHF_BIT_MASK_LEN 10 /* masking 10 bits */ #define HHF_BIT_MASK 0x3FF /* bitmask of 10 bits */ #define WDRR_BUCKET_CNT 2 /* two buckets for Weighted DRR */ enum wdrr_bucket_idx { WDRR_BUCKET_FOR_HH = 0, /* bucket id for heavy-hitters */ WDRR_BUCKET_FOR_NON_HH = 1 /* bucket id for non-heavy-hitters */ }; #define hhf_time_before(a, b) \ (typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0)) /* Heavy-hitter per-flow state */ struct hh_flow_state { u32 hash_id; /* hash of flow-id (e.g. TCP 5-tuple) */ u32 hit_timestamp; /* last time heavy-hitter was seen */ struct list_head flowchain; /* chaining under hash collision */ }; /* Weighted Deficit Round Robin (WDRR) scheduler */ struct wdrr_bucket { struct sk_buff *head; struct sk_buff *tail; struct list_head bucketchain; int deficit; }; struct hhf_sched_data { struct wdrr_bucket buckets[WDRR_BUCKET_CNT]; siphash_key_t perturbation; /* hash perturbation */ u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ u32 drop_overlimit; /* number of times max qdisc packet * limit was hit */ struct list_head *hh_flows; /* table T (currently active HHs) */ u32 hh_flows_limit; /* max active HH allocs */ u32 hh_flows_overlimit; /* num of disallowed HH allocs */ u32 hh_flows_total_cnt; /* total admitted HHs */ u32 hh_flows_current_cnt; /* total current HHs */ u32 *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */ u32 hhf_arrays_reset_timestamp; /* last time hhf_arrays * was reset */ unsigned long *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits * of hhf_arrays */ /* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */ struct list_head new_buckets; /* list of new buckets */ struct list_head old_buckets; /* list of old buckets */ /* Configurable HHF parameters */ u32 hhf_reset_timeout; /* interval to reset counter * arrays in filter F * (default 40ms) */ u32 hhf_admit_bytes; /* counter thresh to classify as * HH (default 128KB). * With these default values, * 128KB / 40ms = 25 Mbps * i.e., we expect to capture HHs * sending > 25 Mbps. */ u32 hhf_evict_timeout; /* aging threshold to evict idle * HHs out of table T. This should * be large enough to avoid * reordering during HH eviction. * (default 1s) */ u32 hhf_non_hh_weight; /* WDRR weight for non-HHs * (default 2, * i.e., non-HH : HH = 2 : 1) */ }; static u32 hhf_time_stamp(void) { return jiffies; } /* Looks up a heavy-hitter flow in a chaining list of table T. */ static struct hh_flow_state *seek_list(const u32 hash, struct list_head *head, struct hhf_sched_data *q) { struct hh_flow_state *flow, *next; u32 now = hhf_time_stamp(); if (list_empty(head)) return NULL; list_for_each_entry_safe(flow, next, head, flowchain) { u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; if (hhf_time_before(prev, now)) { /* Delete expired heavy-hitters, but preserve one entry * to avoid kzalloc() when next time this slot is hit. */ if (list_is_last(&flow->flowchain, head)) return NULL; list_del(&flow->flowchain); kfree(flow); q->hh_flows_current_cnt--; } else if (flow->hash_id == hash) { return flow; } } return NULL; } /* Returns a flow state entry for a new heavy-hitter. Either reuses an expired * entry or dynamically alloc a new entry. */ static struct hh_flow_state *alloc_new_hh(struct list_head *head, struct hhf_sched_data *q) { struct hh_flow_state *flow; u32 now = hhf_time_stamp(); if (!list_empty(head)) { /* Find an expired heavy-hitter flow entry. */ list_for_each_entry(flow, head, flowchain) { u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; if (hhf_time_before(prev, now)) return flow; } } if (q->hh_flows_current_cnt >= q->hh_flows_limit) { q->hh_flows_overlimit++; return NULL; } /* Create new entry. */ flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC); if (!flow) return NULL; q->hh_flows_current_cnt++; INIT_LIST_HEAD(&flow->flowchain); list_add_tail(&flow->flowchain, head); return flow; } /* Assigns packets to WDRR buckets. Implements a multi-stage filter to * classify heavy-hitters. */ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) { struct hhf_sched_data *q = qdisc_priv(sch); u32 tmp_hash, hash; u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos; struct hh_flow_state *flow; u32 pkt_len, min_hhf_val; int i; u32 prev; u32 now = hhf_time_stamp(); /* Reset the HHF counter arrays if this is the right time. */ prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout; if (hhf_time_before(prev, now)) { for (i = 0; i < HHF_ARRAYS_CNT; i++) bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN); q->hhf_arrays_reset_timestamp = now; } /* Get hashed flow-id of the skb. */ hash = skb_get_hash_perturb(skb, &q->perturbation); /* Check if this packet belongs to an already established HH flow. */ flow_pos = hash & HHF_BIT_MASK; flow = seek_list(hash, &q->hh_flows[flow_pos], q); if (flow) { /* found its HH flow */ flow->hit_timestamp = now; return WDRR_BUCKET_FOR_HH; } /* Now pass the packet through the multi-stage filter. */ tmp_hash = hash; xorsum = 0; for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) { /* Split the skb_hash into three 10-bit chunks. */ filter_pos[i] = tmp_hash & HHF_BIT_MASK; xorsum ^= filter_pos[i]; tmp_hash >>= HHF_BIT_MASK_LEN; } /* The last chunk is computed as XOR sum of other chunks. */ filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash; pkt_len = qdisc_pkt_len(skb); min_hhf_val = ~0U; for (i = 0; i < HHF_ARRAYS_CNT; i++) { u32 val; if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) { q->hhf_arrays[i][filter_pos[i]] = 0; __set_bit(filter_pos[i], q->hhf_valid_bits[i]); } val = q->hhf_arrays[i][filter_pos[i]] + pkt_len; if (min_hhf_val > val) min_hhf_val = val; } /* Found a new HH iff all counter values > HH admit threshold. */ if (min_hhf_val > q->hhf_admit_bytes) { /* Just captured a new heavy-hitter. */ flow = alloc_new_hh(&q->hh_flows[flow_pos], q); if (!flow) /* memory alloc problem */ return WDRR_BUCKET_FOR_NON_HH; flow->hash_id = hash; flow->hit_timestamp = now; q->hh_flows_total_cnt++; /* By returning without updating counters in q->hhf_arrays, * we implicitly implement "shielding" (see Optimization O1). */ return WDRR_BUCKET_FOR_HH; } /* Conservative update of HHF arrays (see Optimization O2). */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val) q->hhf_arrays[i][filter_pos[i]] = min_hhf_val; } return WDRR_BUCKET_FOR_NON_HH; } /* Removes one skb from head of bucket. */ static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket) { struct sk_buff *skb = bucket->head; bucket->head = skb->next; skb->next = NULL; return skb; } /* Tail-adds skb to bucket. */ static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb) { if (bucket->head == NULL) bucket->head = skb; else bucket->tail->next = skb; bucket->tail = skb; skb->next = NULL; } static unsigned int hhf_drop(struct Qdisc *sch, struct sk_buff **to_free) { struct hhf_sched_data *q = qdisc_priv(sch); struct wdrr_bucket *bucket; /* Always try to drop from heavy-hitters first. */ bucket = &q->buckets[WDRR_BUCKET_FOR_HH]; if (!bucket->head) bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH]; if (bucket->head) { struct sk_buff *skb = dequeue_head(bucket); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch, to_free); } /* Return id of the bucket from which the packet was dropped. */ return bucket - q->buckets; } static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct hhf_sched_data *q = qdisc_priv(sch); enum wdrr_bucket_idx idx; struct wdrr_bucket *bucket; unsigned int prev_backlog; idx = hhf_classify(skb, sch); bucket = &q->buckets[idx]; bucket_add(bucket, skb); qdisc_qstats_backlog_inc(sch, skb); if (list_empty(&bucket->bucketchain)) { unsigned int weight; /* The logic of new_buckets vs. old_buckets is the same as * new_flows vs. old_flows in the implementation of fq_codel, * i.e., short bursts of non-HHs should have strict priority. */ if (idx == WDRR_BUCKET_FOR_HH) { /* Always move heavy-hitters to old bucket. */ weight = 1; list_add_tail(&bucket->bucketchain, &q->old_buckets); } else { weight = q->hhf_non_hh_weight; list_add_tail(&bucket->bucketchain, &q->new_buckets); } bucket->deficit = weight * q->quantum; } if (++sch->q.qlen <= sch->limit) return NET_XMIT_SUCCESS; prev_backlog = sch->qstats.backlog; q->drop_overlimit++; /* Return Congestion Notification only if we dropped a packet from this * bucket. */ if (hhf_drop(sch, to_free) == idx) return NET_XMIT_CN; /* As we dropped a packet, better let upper stack know this. */ qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog); return NET_XMIT_SUCCESS; } static struct sk_buff *hhf_dequeue(struct Qdisc *sch) { struct hhf_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = NULL; struct wdrr_bucket *bucket; struct list_head *head; begin: head = &q->new_buckets; if (list_empty(head)) { head = &q->old_buckets; if (list_empty(head)) return NULL; } bucket = list_first_entry(head, struct wdrr_bucket, bucketchain); if (bucket->deficit <= 0) { int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ? 1 : q->hhf_non_hh_weight; bucket->deficit += weight * q->quantum; list_move_tail(&bucket->bucketchain, &q->old_buckets); goto begin; } if (bucket->head) { skb = dequeue_head(bucket); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); } if (!skb) { /* Force a pass through old_buckets to prevent starvation. */ if ((head == &q->new_buckets) && !list_empty(&q->old_buckets)) list_move_tail(&bucket->bucketchain, &q->old_buckets); else list_del_init(&bucket->bucketchain); goto begin; } qdisc_bstats_update(sch, skb); bucket->deficit -= qdisc_pkt_len(skb); return skb; } static void hhf_reset(struct Qdisc *sch) { struct sk_buff *skb; while ((skb = hhf_dequeue(sch)) != NULL) rtnl_kfree_skbs(skb, skb); } static void hhf_destroy(struct Qdisc *sch) { int i; struct hhf_sched_data *q = qdisc_priv(sch); for (i = 0; i < HHF_ARRAYS_CNT; i++) { kvfree(q->hhf_arrays[i]); kvfree(q->hhf_valid_bits[i]); } if (!q->hh_flows) return; for (i = 0; i < HH_FLOWS_CNT; i++) { struct hh_flow_state *flow, *next; struct list_head *head = &q->hh_flows[i]; if (list_empty(head)) continue; list_for_each_entry_safe(flow, next, head, flowchain) { list_del(&flow->flowchain); kfree(flow); } } kvfree(q->hh_flows); } static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { [TCA_HHF_BACKLOG_LIMIT] = { .type = NLA_U32 }, [TCA_HHF_QUANTUM] = { .type = NLA_U32 }, [TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 }, [TCA_HHF_RESET_TIMEOUT] = { .type = NLA_U32 }, [TCA_HHF_ADMIT_BYTES] = { .type = NLA_U32 }, [TCA_HHF_EVICT_TIMEOUT] = { .type = NLA_U32 }, [TCA_HHF_NON_HH_WEIGHT] = { .type = NLA_U32 }, }; static int hhf_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_HHF_MAX + 1]; unsigned int qlen, prev_backlog; int err; u64 non_hh_quantum; u32 new_quantum = q->quantum; u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight; if (!opt) return -EINVAL; err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy, NULL); if (err < 0) return err; if (tb[TCA_HHF_QUANTUM]) new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]); if (tb[TCA_HHF_NON_HH_WEIGHT]) new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]); non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight; if (non_hh_quantum == 0 || non_hh_quantum > INT_MAX) return -EINVAL; sch_tree_lock(sch); if (tb[TCA_HHF_BACKLOG_LIMIT]) sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]); q->quantum = new_quantum; q->hhf_non_hh_weight = new_hhf_non_hh_weight; if (tb[TCA_HHF_HH_FLOWS_LIMIT]) q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]); if (tb[TCA_HHF_RESET_TIMEOUT]) { u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]); q->hhf_reset_timeout = usecs_to_jiffies(us); } if (tb[TCA_HHF_ADMIT_BYTES]) q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]); if (tb[TCA_HHF_EVICT_TIMEOUT]) { u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]); q->hhf_evict_timeout = usecs_to_jiffies(us); } qlen = sch->q.qlen; prev_backlog = sch->qstats.backlog; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = hhf_dequeue(sch); rtnl_kfree_skbs(skb, skb); } qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, prev_backlog - sch->qstats.backlog); sch_tree_unlock(sch); return 0; } static int hhf_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct hhf_sched_data *q = qdisc_priv(sch); int i; sch->limit = 1000; q->quantum = psched_mtu(qdisc_dev(sch)); get_random_bytes(&q->perturbation, sizeof(q->perturbation)); INIT_LIST_HEAD(&q->new_buckets); INIT_LIST_HEAD(&q->old_buckets); /* Configurable HHF parameters */ q->hhf_reset_timeout = HZ / 25; /* 40 ms */ q->hhf_admit_bytes = 131072; /* 128 KB */ q->hhf_evict_timeout = HZ; /* 1 sec */ q->hhf_non_hh_weight = 2; if (opt) { int err = hhf_change(sch, opt, extack); if (err) return err; } if (!q->hh_flows) { /* Initialize heavy-hitter flow table. */ q->hh_flows = kvcalloc(HH_FLOWS_CNT, sizeof(struct list_head), GFP_KERNEL); if (!q->hh_flows) return -ENOMEM; for (i = 0; i < HH_FLOWS_CNT; i++) INIT_LIST_HEAD(&q->hh_flows[i]); /* Cap max active HHs at twice len of hh_flows table. */ q->hh_flows_limit = 2 * HH_FLOWS_CNT; q->hh_flows_overlimit = 0; q->hh_flows_total_cnt = 0; q->hh_flows_current_cnt = 0; /* Initialize heavy-hitter filter arrays. */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { q->hhf_arrays[i] = kvcalloc(HHF_ARRAYS_LEN, sizeof(u32), GFP_KERNEL); if (!q->hhf_arrays[i]) { /* Note: hhf_destroy() will be called * by our caller. */ return -ENOMEM; } } q->hhf_arrays_reset_timestamp = hhf_time_stamp(); /* Initialize valid bits of heavy-hitter filter arrays. */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { q->hhf_valid_bits[i] = kvzalloc(HHF_ARRAYS_LEN / BITS_PER_BYTE, GFP_KERNEL); if (!q->hhf_valid_bits[i]) { /* Note: hhf_destroy() will be called * by our caller. */ return -ENOMEM; } } /* Initialize Weighted DRR buckets. */ for (i = 0; i < WDRR_BUCKET_CNT; i++) { struct wdrr_bucket *bucket = q->buckets + i; INIT_LIST_HEAD(&bucket->bucketchain); } } return 0; } static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb) { struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *opts; opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) || nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) || nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) || nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT, jiffies_to_usecs(q->hhf_reset_timeout)) || nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) || nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT, jiffies_to_usecs(q->hhf_evict_timeout)) || nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight)) goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: return -1; } static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct hhf_sched_data *q = qdisc_priv(sch); struct tc_hhf_xstats st = { .drop_overlimit = q->drop_overlimit, .hh_overlimit = q->hh_flows_overlimit, .hh_tot_count = q->hh_flows_total_cnt, .hh_cur_count = q->hh_flows_current_cnt, }; return gnet_stats_copy_app(d, &st, sizeof(st)); } static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { .id = "hhf", .priv_size = sizeof(struct hhf_sched_data), .enqueue = hhf_enqueue, .dequeue = hhf_dequeue, .peek = qdisc_peek_dequeued, .init = hhf_init, .reset = hhf_reset, .destroy = hhf_destroy, .change = hhf_change, .dump = hhf_dump, .dump_stats = hhf_dump_stats, .owner = THIS_MODULE, }; static int __init hhf_module_init(void) { return register_qdisc(&hhf_qdisc_ops); } static void __exit hhf_module_exit(void) { unregister_qdisc(&hhf_qdisc_ops); } module_init(hhf_module_init) module_exit(hhf_module_exit) MODULE_AUTHOR("Terry Lam"); MODULE_AUTHOR("Nandita Dukkipati"); MODULE_LICENSE("GPL"); |
7 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 | /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/slab.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/tcp_states.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> /* * This routine purges all the queues of frames. */ void ax25_clear_queues(ax25_cb *ax25) { skb_queue_purge(&ax25->write_queue); skb_queue_purge(&ax25->ack_queue); skb_queue_purge(&ax25->reseq_queue); skb_queue_purge(&ax25->frag_queue); } /* * This routine purges the input queue of those frames that have been * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the * SDL diagram. */ void ax25_frames_acked(ax25_cb *ax25, unsigned short nr) { struct sk_buff *skb; /* * Remove all the ack-ed frames from the ack queue. */ if (ax25->va != nr) { while (skb_peek(&ax25->ack_queue) != NULL && ax25->va != nr) { skb = skb_dequeue(&ax25->ack_queue); kfree_skb(skb); ax25->va = (ax25->va + 1) % ax25->modulus; } } } void ax25_requeue_frames(ax25_cb *ax25) { struct sk_buff *skb; /* * Requeue all the un-ack-ed frames on the output queue to be picked * up by ax25_kick called from the timer. This arrangement handles the * possibility of an empty output queue. */ while ((skb = skb_dequeue_tail(&ax25->ack_queue)) != NULL) skb_queue_head(&ax25->write_queue, skb); } /* * Validate that the value of nr is between va and vs. Return true or * false for testing. */ int ax25_validate_nr(ax25_cb *ax25, unsigned short nr) { unsigned short vc = ax25->va; while (vc != ax25->vs) { if (nr == vc) return 1; vc = (vc + 1) % ax25->modulus; } if (nr == ax25->vs) return 1; return 0; } /* * This routine is the centralised routine for parsing the control * information for the different frame formats. */ int ax25_decode(ax25_cb *ax25, struct sk_buff *skb, int *ns, int *nr, int *pf) { unsigned char *frame; int frametype = AX25_ILLEGAL; frame = skb->data; *ns = *nr = *pf = 0; if (ax25->modulus == AX25_MODULUS) { if ((frame[0] & AX25_S) == 0) { frametype = AX25_I; /* I frame - carries NR/NS/PF */ *ns = (frame[0] >> 1) & 0x07; *nr = (frame[0] >> 5) & 0x07; *pf = frame[0] & AX25_PF; } else if ((frame[0] & AX25_U) == 1) { /* S frame - take out PF/NR */ frametype = frame[0] & 0x0F; *nr = (frame[0] >> 5) & 0x07; *pf = frame[0] & AX25_PF; } else if ((frame[0] & AX25_U) == 3) { /* U frame - take out PF */ frametype = frame[0] & ~AX25_PF; *pf = frame[0] & AX25_PF; } skb_pull(skb, 1); } else { if ((frame[0] & AX25_S) == 0) { frametype = AX25_I; /* I frame - carries NR/NS/PF */ *ns = (frame[0] >> 1) & 0x7F; *nr = (frame[1] >> 1) & 0x7F; *pf = frame[1] & AX25_EPF; skb_pull(skb, 2); } else if ((frame[0] & AX25_U) == 1) { /* S frame - take out PF/NR */ frametype = frame[0] & 0x0F; *nr = (frame[1] >> 1) & 0x7F; *pf = frame[1] & AX25_EPF; skb_pull(skb, 2); } else if ((frame[0] & AX25_U) == 3) { /* U frame - take out PF */ frametype = frame[0] & ~AX25_PF; *pf = frame[0] & AX25_PF; skb_pull(skb, 1); } } return frametype; } /* * This routine is called when the HDLC layer internally generates a * command or response for the remote machine ( eg. RR, UA etc. ). * Only supervisory or unnumbered frames are processed. */ void ax25_send_control(ax25_cb *ax25, int frametype, int poll_bit, int type) { struct sk_buff *skb; unsigned char *dptr; if ((skb = alloc_skb(ax25->ax25_dev->dev->hard_header_len + 2, GFP_ATOMIC)) == NULL) return; skb_reserve(skb, ax25->ax25_dev->dev->hard_header_len); skb_reset_network_header(skb); /* Assume a response - address structure for DTE */ if (ax25->modulus == AX25_MODULUS) { dptr = skb_put(skb, 1); *dptr = frametype; *dptr |= (poll_bit) ? AX25_PF : 0; if ((frametype & AX25_U) == AX25_S) /* S frames carry NR */ *dptr |= (ax25->vr << 5); } else { if ((frametype & AX25_U) == AX25_U) { dptr = skb_put(skb, 1); *dptr = frametype; *dptr |= (poll_bit) ? AX25_PF : 0; } else { dptr = skb_put(skb, 2); dptr[0] = frametype; dptr[1] = (ax25->vr << 1); dptr[1] |= (poll_bit) ? AX25_EPF : 0; } } ax25_transmit_buffer(ax25, skb, type); } /* * Send a 'DM' to an unknown connection attempt, or an invalid caller. * * Note: src here is the sender, thus it's the target of the DM */ void ax25_return_dm(struct net_device *dev, ax25_address *src, ax25_address *dest, ax25_digi *digi) { struct sk_buff *skb; char *dptr; ax25_digi retdigi; if (dev == NULL) return; if ((skb = alloc_skb(dev->hard_header_len + 1, GFP_ATOMIC)) == NULL) return; /* Next SABM will get DM'd */ skb_reserve(skb, dev->hard_header_len); skb_reset_network_header(skb); ax25_digi_invert(digi, &retdigi); dptr = skb_put(skb, 1); *dptr = AX25_DM | AX25_PF; /* * Do the address ourselves */ dptr = skb_push(skb, ax25_addr_size(digi)); dptr += ax25_addr_build(dptr, dest, src, &retdigi, AX25_RESPONSE, AX25_MODULUS); ax25_queue_xmit(skb, dev); } /* * Exponential backoff for AX.25 */ void ax25_calculate_t1(ax25_cb *ax25) { int n, t = 2; switch (ax25->backoff) { case 0: break; case 1: t += 2 * ax25->n2count; break; case 2: for (n = 0; n < ax25->n2count; n++) t *= 2; if (t > 8) t = 8; break; } ax25->t1 = t * ax25->rtt; } /* * Calculate the Round Trip Time */ void ax25_calculate_rtt(ax25_cb *ax25) { if (ax25->backoff == 0) return; if (ax25_t1timer_running(ax25) && ax25->n2count == 0) ax25->rtt = (9 * ax25->rtt + ax25->t1 - ax25_display_timer(&ax25->t1timer)) / 10; if (ax25->rtt < AX25_T1CLAMPLO) ax25->rtt = AX25_T1CLAMPLO; if (ax25->rtt > AX25_T1CLAMPHI) ax25->rtt = AX25_T1CLAMPHI; } void ax25_disconnect(ax25_cb *ax25, int reason) { ax25_clear_queues(ax25); if (reason == ENETUNREACH) { del_timer_sync(&ax25->timer); del_timer_sync(&ax25->t1timer); del_timer_sync(&ax25->t2timer); del_timer_sync(&ax25->t3timer); del_timer_sync(&ax25->idletimer); } else { if (!ax25->sk || !sock_flag(ax25->sk, SOCK_DESTROY)) ax25_stop_heartbeat(ax25); ax25_stop_t1timer(ax25); ax25_stop_t2timer(ax25); ax25_stop_t3timer(ax25); ax25_stop_idletimer(ax25); } ax25->state = AX25_STATE_0; ax25_link_failed(ax25, reason); if (ax25->sk != NULL) { local_bh_disable(); bh_lock_sock(ax25->sk); ax25->sk->sk_state = TCP_CLOSE; ax25->sk->sk_err = reason; ax25->sk->sk_shutdown |= SEND_SHUTDOWN; if (!sock_flag(ax25->sk, SOCK_DEAD)) { ax25->sk->sk_state_change(ax25->sk); sock_set_flag(ax25->sk, SOCK_DEAD); } bh_unlock_sock(ax25->sk); local_bh_enable(); } } |
13 13 13 13 13 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 | /* * net/tipc/discover.c * * Copyright (c) 2003-2006, 2014-2018, Ericsson AB * Copyright (c) 2005-2006, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "node.h" #include "discover.h" /* min delay during bearer start up */ #define TIPC_DISC_INIT msecs_to_jiffies(125) /* max delay if bearer has no links */ #define TIPC_DISC_FAST msecs_to_jiffies(1000) /* max delay if bearer has links */ #define TIPC_DISC_SLOW msecs_to_jiffies(60000) /* indicates no timer in use */ #define TIPC_DISC_INACTIVE 0xffffffff /** * struct tipc_discoverer - information about an ongoing link setup request * @bearer_id: identity of bearer issuing requests * @net: network namespace instance * @dest: destination address for request messages * @domain: network domain to which links can be established * @num_nodes: number of nodes currently discovered (i.e. with an active link) * @lock: spinlock for controlling access to requests * @skb: request message to be (repeatedly) sent * @timer: timer governing period between requests * @timer_intv: current interval between requests (in ms) */ struct tipc_discoverer { u32 bearer_id; struct tipc_media_addr dest; struct net *net; u32 domain; int num_nodes; spinlock_t lock; struct sk_buff *skb; struct timer_list timer; unsigned long timer_intv; }; /** * tipc_disc_init_msg - initialize a link setup message * @net: the applicable net namespace * @type: message type (request or response) * @b: ptr to bearer issuing message */ static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb, u32 mtyp, struct tipc_bearer *b) { struct tipc_net *tn = tipc_net(net); u32 dest_domain = b->domain; struct tipc_msg *hdr; hdr = buf_msg(skb); tipc_msg_init(tn->trial_addr, hdr, LINK_CONFIG, mtyp, MAX_H_SIZE, dest_domain); msg_set_size(hdr, MAX_H_SIZE + NODE_ID_LEN); msg_set_non_seq(hdr, 1); msg_set_node_sig(hdr, tn->random); msg_set_node_capabilities(hdr, TIPC_NODE_CAPABILITIES); msg_set_dest_domain(hdr, dest_domain); msg_set_bc_netid(hdr, tn->net_id); b->media->addr2msg(msg_media_addr(hdr), &b->addr); msg_set_node_id(hdr, tipc_own_id(net)); } static void tipc_disc_msg_xmit(struct net *net, u32 mtyp, u32 dst, u32 src, u32 sugg_addr, struct tipc_media_addr *maddr, struct tipc_bearer *b) { struct tipc_msg *hdr; struct sk_buff *skb; skb = tipc_buf_acquire(MAX_H_SIZE + NODE_ID_LEN, GFP_ATOMIC); if (!skb) return; hdr = buf_msg(skb); tipc_disc_init_msg(net, skb, mtyp, b); msg_set_sugg_node_addr(hdr, sugg_addr); msg_set_dest_domain(hdr, dst); tipc_bearer_xmit_skb(net, b->identity, skb, maddr); } /** * disc_dupl_alert - issue node address duplication alert * @b: pointer to bearer detecting duplication * @node_addr: duplicated node address * @media_addr: media address advertised by duplicated node */ static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr, struct tipc_media_addr *media_addr) { char media_addr_str[64]; tipc_media_addr_printf(media_addr_str, sizeof(media_addr_str), media_addr); pr_warn("Duplicate %x using %s seen on <%s>\n", node_addr, media_addr_str, b->name); } /* tipc_disc_addr_trial(): - handle an address uniqueness trial from peer * Returns true if message should be dropped by caller, i.e., if it is a * trial message or we are inside trial period. Otherwise false. */ static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d, struct tipc_media_addr *maddr, struct tipc_bearer *b, u32 dst, u32 src, u32 sugg_addr, u8 *peer_id, int mtyp) { struct net *net = d->net; struct tipc_net *tn = tipc_net(net); u32 self = tipc_own_addr(net); bool trial = time_before(jiffies, tn->addr_trial_end) && !self; if (mtyp == DSC_TRIAL_FAIL_MSG) { if (!trial) return true; /* Ignore if somebody else already gave new suggestion */ if (dst != tn->trial_addr) return true; /* Otherwise update trial address and restart trial period */ tn->trial_addr = sugg_addr; msg_set_prevnode(buf_msg(d->skb), sugg_addr); tn->addr_trial_end = jiffies + msecs_to_jiffies(1000); return true; } /* Apply trial address if we just left trial period */ if (!trial && !self) { tipc_sched_net_finalize(net, tn->trial_addr); msg_set_prevnode(buf_msg(d->skb), tn->trial_addr); msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); } /* Accept regular link requests/responses only after trial period */ if (mtyp != DSC_TRIAL_MSG) return trial; sugg_addr = tipc_node_try_addr(net, peer_id, src); if (sugg_addr) tipc_disc_msg_xmit(net, DSC_TRIAL_FAIL_MSG, src, self, sugg_addr, maddr, b); return true; } /** * tipc_disc_rcv - handle incoming discovery message (request or response) * @net: applicable net namespace * @skb: buffer containing message * @b: bearer that message arrived on */ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) { struct tipc_net *tn = tipc_net(net); struct tipc_msg *hdr = buf_msg(skb); u16 caps = msg_node_capabilities(hdr); bool legacy = tn->legacy_addr_format; u32 sugg = msg_sugg_node_addr(hdr); u32 signature = msg_node_sig(hdr); u8 peer_id[NODE_ID_LEN] = {0,}; u32 dst = msg_dest_domain(hdr); u32 net_id = msg_bc_netid(hdr); struct tipc_media_addr maddr; u32 src = msg_prevnode(hdr); u32 mtyp = msg_type(hdr); bool dupl_addr = false; bool respond = false; u32 self; int err; if (skb_linearize(skb)) { kfree_skb(skb); return; } hdr = buf_msg(skb); if (caps & TIPC_NODE_ID128) memcpy(peer_id, msg_node_id(hdr), NODE_ID_LEN); else sprintf(peer_id, "%x", src); err = b->media->msg2addr(b, &maddr, msg_media_addr(hdr)); kfree_skb(skb); if (err || maddr.broadcast) { pr_warn_ratelimited("Rcv corrupt discovery message\n"); return; } /* Ignore discovery messages from own node */ if (!memcmp(&maddr, &b->addr, sizeof(maddr))) return; if (net_id != tn->net_id) return; if (tipc_disc_addr_trial_msg(b->disc, &maddr, b, dst, src, sugg, peer_id, mtyp)) return; self = tipc_own_addr(net); /* Message from somebody using this node's address */ if (in_own_node(net, src)) { disc_dupl_alert(b, self, &maddr); return; } if (!tipc_in_scope(legacy, dst, self)) return; if (!tipc_in_scope(legacy, b->domain, src)) return; tipc_node_check_dest(net, src, peer_id, b, caps, signature, &maddr, &respond, &dupl_addr); if (dupl_addr) disc_dupl_alert(b, src, &maddr); if (!respond) return; if (mtyp != DSC_REQ_MSG) return; tipc_disc_msg_xmit(net, DSC_RESP_MSG, src, self, 0, &maddr, b); } /* tipc_disc_add_dest - increment set of discovered nodes */ void tipc_disc_add_dest(struct tipc_discoverer *d) { spin_lock_bh(&d->lock); d->num_nodes++; spin_unlock_bh(&d->lock); } /* tipc_disc_remove_dest - decrement set of discovered nodes */ void tipc_disc_remove_dest(struct tipc_discoverer *d) { int intv, num; spin_lock_bh(&d->lock); d->num_nodes--; num = d->num_nodes; intv = d->timer_intv; if (!num && (intv == TIPC_DISC_INACTIVE || intv > TIPC_DISC_FAST)) { d->timer_intv = TIPC_DISC_INIT; mod_timer(&d->timer, jiffies + d->timer_intv); } spin_unlock_bh(&d->lock); } /* tipc_disc_timeout - send a periodic link setup request * Called whenever a link setup request timer associated with a bearer expires. * - Keep doubling time between sent request until limit is reached; * - Hold at fast polling rate if we don't have any associated nodes * - Otherwise hold at slow polling rate */ static void tipc_disc_timeout(struct timer_list *t) { struct tipc_discoverer *d = from_timer(d, t, timer); struct tipc_net *tn = tipc_net(d->net); struct tipc_media_addr maddr; struct sk_buff *skb = NULL; struct net *net = d->net; u32 bearer_id; spin_lock_bh(&d->lock); /* Stop searching if only desired node has been found */ if (tipc_node(d->domain) && d->num_nodes) { d->timer_intv = TIPC_DISC_INACTIVE; goto exit; } /* Did we just leave trial period ? */ if (!time_before(jiffies, tn->addr_trial_end) && !tipc_own_addr(net)) { mod_timer(&d->timer, jiffies + TIPC_DISC_INIT); spin_unlock_bh(&d->lock); tipc_sched_net_finalize(net, tn->trial_addr); return; } /* Adjust timeout interval according to discovery phase */ if (time_before(jiffies, tn->addr_trial_end)) { d->timer_intv = TIPC_DISC_INIT; } else { d->timer_intv *= 2; if (d->num_nodes && d->timer_intv > TIPC_DISC_SLOW) d->timer_intv = TIPC_DISC_SLOW; else if (!d->num_nodes && d->timer_intv > TIPC_DISC_FAST) d->timer_intv = TIPC_DISC_FAST; msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); msg_set_prevnode(buf_msg(d->skb), tn->trial_addr); } mod_timer(&d->timer, jiffies + d->timer_intv); memcpy(&maddr, &d->dest, sizeof(maddr)); skb = skb_clone(d->skb, GFP_ATOMIC); bearer_id = d->bearer_id; exit: spin_unlock_bh(&d->lock); if (skb) tipc_bearer_xmit_skb(net, bearer_id, skb, &maddr); } /** * tipc_disc_create - create object to send periodic link setup requests * @net: the applicable net namespace * @b: ptr to bearer issuing requests * @dest: destination address for request messages * @dest_domain: network domain to which links can be established * * Returns 0 if successful, otherwise -errno. */ int tipc_disc_create(struct net *net, struct tipc_bearer *b, struct tipc_media_addr *dest, struct sk_buff **skb) { struct tipc_net *tn = tipc_net(net); struct tipc_discoverer *d; d = kmalloc(sizeof(*d), GFP_ATOMIC); if (!d) return -ENOMEM; d->skb = tipc_buf_acquire(MAX_H_SIZE + NODE_ID_LEN, GFP_ATOMIC); if (!d->skb) { kfree(d); return -ENOMEM; } tipc_disc_init_msg(net, d->skb, DSC_REQ_MSG, b); /* Do we need an address trial period first ? */ if (!tipc_own_addr(net)) { tn->addr_trial_end = jiffies + msecs_to_jiffies(1000); msg_set_type(buf_msg(d->skb), DSC_TRIAL_MSG); } memcpy(&d->dest, dest, sizeof(*dest)); d->net = net; d->bearer_id = b->identity; d->domain = b->domain; d->num_nodes = 0; d->timer_intv = TIPC_DISC_INIT; spin_lock_init(&d->lock); timer_setup(&d->timer, tipc_disc_timeout, 0); mod_timer(&d->timer, jiffies + d->timer_intv); b->disc = d; *skb = skb_clone(d->skb, GFP_ATOMIC); return 0; } /** * tipc_disc_delete - destroy object sending periodic link setup requests * @d: ptr to link duest structure */ void tipc_disc_delete(struct tipc_discoverer *d) { del_timer_sync(&d->timer); kfree_skb(d->skb); kfree(d); } /** * tipc_disc_reset - reset object to send periodic link setup requests * @net: the applicable net namespace * @b: ptr to bearer issuing requests * @dest_domain: network domain to which links can be established */ void tipc_disc_reset(struct net *net, struct tipc_bearer *b) { struct tipc_discoverer *d = b->disc; struct tipc_media_addr maddr; struct sk_buff *skb; spin_lock_bh(&d->lock); tipc_disc_init_msg(net, d->skb, DSC_REQ_MSG, b); d->net = net; d->bearer_id = b->identity; d->domain = b->domain; d->num_nodes = 0; d->timer_intv = TIPC_DISC_INIT; memcpy(&maddr, &d->dest, sizeof(maddr)); mod_timer(&d->timer, jiffies + d->timer_intv); skb = skb_clone(d->skb, GFP_ATOMIC); spin_unlock_bh(&d->lock); if (skb) tipc_bearer_xmit_skb(net, b->identity, skb, &maddr); } |
27 48 172 207 190 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_BMAP_H__ #define __XFS_BMAP_H__ struct getbmap; struct xfs_bmbt_irec; struct xfs_ifork; struct xfs_inode; struct xfs_mount; struct xfs_trans; extern kmem_zone_t *xfs_bmap_free_item_zone; /* * Argument structure for xfs_bmap_alloc. */ struct xfs_bmalloca { struct xfs_trans *tp; /* transaction pointer */ struct xfs_inode *ip; /* incore inode pointer */ struct xfs_bmbt_irec prev; /* extent before the new one */ struct xfs_bmbt_irec got; /* extent after, or delayed */ xfs_fileoff_t offset; /* offset in file filling in */ xfs_extlen_t length; /* i/o length asked/allocated */ xfs_fsblock_t blkno; /* starting block of new extent */ struct xfs_btree_cur *cur; /* btree cursor */ struct xfs_iext_cursor icur; /* incore extent cursor */ int nallocs;/* number of extents alloc'd */ int logflags;/* flags for transaction logging */ xfs_extlen_t total; /* total blocks needed for xaction */ xfs_extlen_t minlen; /* minimum allocation size (blocks) */ xfs_extlen_t minleft; /* amount must be left after alloc */ bool eof; /* set if allocating past last extent */ bool wasdel; /* replacing a delayed allocation */ bool aeof; /* allocated space at eof */ bool conv; /* overwriting unwritten extents */ int datatype;/* data type being allocated */ int flags; }; /* * List of extents to be free "later". * The list is kept sorted on xbf_startblock. */ struct xfs_extent_free_item { xfs_fsblock_t xefi_startblock;/* starting fs block number */ xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ bool xefi_skip_discard; struct list_head xefi_list; struct xfs_owner_info xefi_oinfo; /* extent owner */ }; #define XFS_BMAP_MAX_NMAP 4 /* * Flags for xfs_bmapi_* */ #define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */ #define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */ #define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */ #define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */ #define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */ /* * unwritten extent conversion - this needs write cache flushing and no additional * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts * from written to unwritten, otherwise convert from unwritten to written. */ #define XFS_BMAPI_CONVERT 0x040 /* * allocate zeroed extents - this requires all newly allocated user data extents * to be initialised to zero. It will be ignored if XFS_BMAPI_METADATA is set. * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found * during the allocation range to zeroed written extents. */ #define XFS_BMAPI_ZERO 0x080 /* * Map the inode offset to the block given in ap->firstblock. Primarily * used for reflink. The range must be in a hole, and this flag cannot be * turned on with PREALLOC or CONVERT, and cannot be used on the attr fork. * * For bunmapi, this flag unmaps the range without adjusting quota, reducing * refcount, or freeing the blocks. */ #define XFS_BMAPI_REMAP 0x100 /* Map something in the CoW fork. */ #define XFS_BMAPI_COWFORK 0x200 /* Only convert delalloc space, don't allocate entirely new extents */ #define XFS_BMAPI_DELALLOC 0x400 /* Only convert unwritten extents, don't allocate new blocks */ #define XFS_BMAPI_CONVERT_ONLY 0x800 /* Skip online discard of freed extents */ #define XFS_BMAPI_NODISCARD 0x1000 /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ #define XFS_BMAPI_NORMAP 0x2000 #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ { XFS_BMAPI_CONVERT, "CONVERT" }, \ { XFS_BMAPI_ZERO, "ZERO" }, \ { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ { XFS_BMAPI_DELALLOC, "DELALLOC" }, \ { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ { XFS_BMAPI_NORMAP, "NORMAP" } static inline int xfs_bmapi_aflag(int w) { return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : (w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0)); } static inline int xfs_bmapi_whichfork(int bmapi_flags) { if (bmapi_flags & XFS_BMAPI_COWFORK) return XFS_COW_FORK; else if (bmapi_flags & XFS_BMAPI_ATTRFORK) return XFS_ATTR_FORK; return XFS_DATA_FORK; } /* * Special values for xfs_bmbt_irec_t br_startblock field. */ #define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL) #define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL) /* * Flags for xfs_bmap_add_extent*. */ #define BMAP_LEFT_CONTIG (1 << 0) #define BMAP_RIGHT_CONTIG (1 << 1) #define BMAP_LEFT_FILLING (1 << 2) #define BMAP_RIGHT_FILLING (1 << 3) #define BMAP_LEFT_DELAY (1 << 4) #define BMAP_RIGHT_DELAY (1 << 5) #define BMAP_LEFT_VALID (1 << 6) #define BMAP_RIGHT_VALID (1 << 7) #define BMAP_ATTRFORK (1 << 8) #define BMAP_COWFORK (1 << 9) #define XFS_BMAP_EXT_FLAGS \ { BMAP_LEFT_CONTIG, "LC" }, \ { BMAP_RIGHT_CONTIG, "RC" }, \ { BMAP_LEFT_FILLING, "LF" }, \ { BMAP_RIGHT_FILLING, "RF" }, \ { BMAP_ATTRFORK, "ATTR" }, \ { BMAP_COWFORK, "COW" } /* * Return true if the extent is a real, allocated extent, or false if it is a * delayed allocation, and unwritten extent or a hole. */ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) { return irec->br_state != XFS_EXT_UNWRITTEN && irec->br_startblock != HOLESTARTBLOCK && irec->br_startblock != DELAYSTARTBLOCK && !isnullstartblock(irec->br_startblock); } void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, struct xfs_owner_info *oinfo, bool skip_discard); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *last_block, int whichfork); int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused, int whichfork); int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork); int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, int flags); int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, int flags, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap); int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags, xfs_extnum_t nexts); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, int flags, xfs_extnum_t nexts, int *done); int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); void xfs_bmap_del_extent_cow(struct xfs_inode *ip, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); uint xfs_default_attroffset(struct xfs_inode *ip); int xfs_bmap_collapse_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done); int xfs_bmap_can_insert_extents(struct xfs_inode *ip, xfs_fileoff_t off, xfs_fileoff_t shift); int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, xfs_fileoff_t stop_fsb); int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, int eof); static inline void xfs_bmap_add_free( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, struct xfs_owner_info *oinfo) { __xfs_bmap_add_free(tp, bno, len, oinfo, false); } enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, XFS_BMAP_UNMAP, }; struct xfs_bmap_intent { struct list_head bi_list; enum xfs_bmap_intent_type bi_type; struct xfs_inode *bi_owner; int bi_whichfork; struct xfs_bmbt_irec bi_bmap; }; int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_inode *ip, enum xfs_bmap_intent_type type, int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, xfs_filblks_t *blockcount, xfs_exntst_t state); int xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); int xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); static inline int xfs_bmap_fork_to_state(int whichfork) { switch (whichfork) { case XFS_ATTR_FORK: return BMAP_ATTRFORK; case XFS_COW_FORK: return BMAP_COWFORK; default: return 0; } } xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *irec); int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, int flags); #endif /* __XFS_BMAP_H__ */ |
8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 | /* * linux/fs/hfs/hfs_fs.h * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. */ #ifndef _LINUX_HFS_FS_H #define _LINUX_HFS_FS_H #ifdef pr_fmt #undef pr_fmt #endif #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> #include <linux/types.h> #include <linux/mutex.h> #include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/workqueue.h> #include <asm/byteorder.h> #include <linux/uaccess.h> #include "hfs.h" #define DBG_BNODE_REFS 0x00000001 #define DBG_BNODE_MOD 0x00000002 #define DBG_CAT_MOD 0x00000004 #define DBG_INODE 0x00000008 #define DBG_SUPER 0x00000010 #define DBG_EXTENT 0x00000020 #define DBG_BITMAP 0x00000040 //#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD|DBG_CAT_MOD|DBG_BITMAP) //#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) //#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) #define DBG_MASK (0) #define hfs_dbg(flg, fmt, ...) \ do { \ if (DBG_##flg & DBG_MASK) \ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ } while (0) #define hfs_dbg_cont(flg, fmt, ...) \ do { \ if (DBG_##flg & DBG_MASK) \ pr_cont(fmt, ##__VA_ARGS__); \ } while (0) /* * struct hfs_inode_info * * The HFS-specific part of a Linux (struct inode) */ struct hfs_inode_info { atomic_t opencnt; unsigned int flags; /* to deal with localtime ugliness */ int tz_secondswest; struct hfs_cat_key cat_key; struct list_head open_dir_list; spinlock_t open_dir_lock; struct inode *rsrc_inode; struct mutex extents_lock; u16 alloc_blocks, clump_blocks; sector_t fs_blocks; /* Allocation extents from catlog record or volume header */ hfs_extent_rec first_extents; u16 first_blocks; hfs_extent_rec cached_extents; u16 cached_start, cached_blocks; loff_t phys_size; struct inode vfs_inode; }; #define HFS_FLG_RSRC 0x0001 #define HFS_FLG_EXT_DIRTY 0x0002 #define HFS_FLG_EXT_NEW 0x0004 #define HFS_IS_RSRC(inode) (HFS_I(inode)->flags & HFS_FLG_RSRC) /* * struct hfs_sb_info * * The HFS-specific part of a Linux (struct super_block) */ struct hfs_sb_info { struct buffer_head *mdb_bh; /* The hfs_buffer holding the real superblock (aka VIB or MDB) */ struct hfs_mdb *mdb; struct buffer_head *alt_mdb_bh; /* The hfs_buffer holding the alternate superblock */ struct hfs_mdb *alt_mdb; __be32 *bitmap; /* The page holding the allocation bitmap */ struct hfs_btree *ext_tree; /* Information about the extents b-tree */ struct hfs_btree *cat_tree; /* Information about the catalog b-tree */ u32 file_count; /* The number of regular files in the filesystem */ u32 folder_count; /* The number of directories in the filesystem */ u32 next_id; /* The next available file id number */ u32 clumpablks; /* The number of allocation blocks to try to add when extending a file */ u32 fs_start; /* The first 512-byte block represented in the bitmap */ u32 part_start; u16 root_files; /* The number of regular (non-directory) files in the root directory */ u16 root_dirs; /* The number of directories in the root directory */ u16 fs_ablocks; /* The number of allocation blocks in the filesystem */ u16 free_ablocks; /* the number of unused allocation blocks in the filesystem */ u32 alloc_blksz; /* The size of an "allocation block" */ int s_quiet; /* Silent failure when changing owner or mode? */ __be32 s_type; /* Type for new files */ __be32 s_creator; /* Creator for new files */ umode_t s_file_umask; /* The umask applied to the permissions on all files */ umode_t s_dir_umask; /* The umask applied to the permissions on all dirs */ kuid_t s_uid; /* The uid of all files */ kgid_t s_gid; /* The gid of all files */ int session, part; struct nls_table *nls_io, *nls_disk; struct mutex bitmap_lock; unsigned long flags; u16 blockoffset; int fs_div; struct super_block *sb; int work_queued; /* non-zero delayed work is queued */ struct delayed_work mdb_work; /* MDB flush delayed work */ spinlock_t work_lock; /* protects mdb_work and work_queued */ }; #define HFS_FLG_BITMAP_DIRTY 0 #define HFS_FLG_MDB_DIRTY 1 #define HFS_FLG_ALT_MDB_DIRTY 2 /* bitmap.c */ extern u32 hfs_vbm_search_free(struct super_block *, u32, u32 *); extern int hfs_clear_vbm_bits(struct super_block *, u16, u16); /* catalog.c */ extern int hfs_cat_keycmp(const btree_key *, const btree_key *); struct hfs_find_data; extern int hfs_cat_find_brec(struct super_block *, u32, struct hfs_find_data *); extern int hfs_cat_create(u32, struct inode *, const struct qstr *, struct inode *); extern int hfs_cat_delete(u32, struct inode *, const struct qstr *); extern int hfs_cat_move(u32, struct inode *, const struct qstr *, struct inode *, const struct qstr *); extern void hfs_cat_build_key(struct super_block *, btree_key *, u32, const struct qstr *); /* dir.c */ extern const struct file_operations hfs_dir_operations; extern const struct inode_operations hfs_dir_inode_operations; /* extent.c */ extern int hfs_ext_keycmp(const btree_key *, const btree_key *); extern int hfs_free_fork(struct super_block *, struct hfs_cat_file *, int); extern int hfs_ext_write_extent(struct inode *); extern int hfs_extend_file(struct inode *); extern void hfs_file_truncate(struct inode *); extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int); /* inode.c */ extern const struct address_space_operations hfs_aops; extern const struct address_space_operations hfs_btree_aops; extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t); extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); extern int hfs_write_inode(struct inode *, struct writeback_control *); extern int hfs_inode_setattr(struct dentry *, struct iattr *); extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, __be32 log_size, __be32 phys_size, u32 clump_size); extern struct inode *hfs_iget(struct super_block *, struct hfs_cat_key *, hfs_cat_rec *); extern void hfs_evict_inode(struct inode *); extern void hfs_delete_inode(struct inode *); /* attr.c */ extern const struct xattr_handler *hfs_xattr_handlers[]; /* mdb.c */ extern int hfs_mdb_get(struct super_block *); extern void hfs_mdb_commit(struct super_block *); extern void hfs_mdb_close(struct super_block *); extern void hfs_mdb_put(struct super_block *); /* part_tbl.c */ extern int hfs_part_find(struct super_block *, sector_t *, sector_t *); /* string.c */ extern const struct dentry_operations hfs_dentry_operations; extern int hfs_hash_dentry(const struct dentry *, struct qstr *); extern int hfs_strcmp(const unsigned char *, unsigned int, const unsigned char *, unsigned int); extern int hfs_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); /* trans.c */ extern void hfs_asc2mac(struct super_block *, struct hfs_name *, const struct qstr *); extern int hfs_mac2asc(struct super_block *, char *, const struct hfs_name *); /* super.c */ extern void hfs_mark_mdb_dirty(struct super_block *sb); /* * There are two time systems. Both are based on seconds since * a particular time/date. * Unix: unsigned lil-endian since 00:00 GMT, Jan. 1, 1970 * mac: unsigned big-endian since 00:00 GMT, Jan. 1, 1904 * */ #define __hfs_u_to_mtime(sec) cpu_to_be32(sec + 2082844800U - sys_tz.tz_minuteswest * 60) #define __hfs_m_to_utime(sec) (be32_to_cpu(sec) - 2082844800U + sys_tz.tz_minuteswest * 60) #define HFS_I(inode) (container_of(inode, struct hfs_inode_info, vfs_inode)) #define HFS_SB(sb) ((struct hfs_sb_info *)(sb)->s_fs_info) #define hfs_m_to_utime(time) (struct timespec){ .tv_sec = __hfs_m_to_utime(time) } #define hfs_u_to_mtime(time) __hfs_u_to_mtime((time).tv_sec) #define hfs_mtime() __hfs_u_to_mtime(get_seconds()) static inline const char *hfs_mdb_name(struct super_block *sb) { return sb->s_id; } static inline void hfs_bitmap_dirty(struct super_block *sb) { set_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags); hfs_mark_mdb_dirty(sb); } #define sb_bread512(sb, sec, data) ({ \ struct buffer_head *__bh; \ sector_t __block; \ loff_t __start; \ int __offset; \ \ __start = (loff_t)(sec) << HFS_SECTOR_SIZE_BITS;\ __block = __start >> (sb)->s_blocksize_bits; \ __offset = __start & ((sb)->s_blocksize - 1); \ __bh = sb_bread((sb), __block); \ if (likely(__bh != NULL)) \ data = (void *)(__bh->b_data + __offset);\ else \ data = NULL; \ __bh; \ }) #endif |
1 1 7 7 7 5 5 3 8 7 7 5 5 5 6 2 2 2 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 | /* * This program is free software; you can distribute it and/or modify it * under the terms of the GNU General Public License (Version 2) as * published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, see <http://www.gnu.org/licenses/>. * * Copyright (C) Hans Alblas PE1AYX <hans@esrac.ele.tue.nl> * Copyright (C) 2004, 05 Ralf Baechle DL5RB <ralf@linux-mips.org> * Copyright (C) 2004, 05 Thomas Osterried DL9SAU <thomas@x-berg.in-berlin.de> */ #include <linux/module.h> #include <linux/bitops.h> #include <linux/uaccess.h> #include <linux/crc16.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/major.h> #include <linux/init.h> #include <linux/rtnetlink.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/jiffies.h> #include <linux/compat.h> #include <net/ax25.h> #define AX_MTU 236 /* some arch define END as assembly function ending, just undef it */ #undef END /* SLIP/KISS protocol characters. */ #define END 0300 /* indicates end of frame */ #define ESC 0333 /* indicates byte stuffing */ #define ESC_END 0334 /* ESC ESC_END means END 'data' */ #define ESC_ESC 0335 /* ESC ESC_ESC means ESC 'data' */ struct mkiss { struct tty_struct *tty; /* ptr to TTY structure */ struct net_device *dev; /* easy for intr handling */ /* These are pointers to the malloc()ed frame buffers. */ spinlock_t buflock;/* lock for rbuf and xbuf */ unsigned char *rbuff; /* receiver buffer */ int rcount; /* received chars counter */ unsigned char *xbuff; /* transmitter buffer */ unsigned char *xhead; /* pointer to next byte to XMIT */ int xleft; /* bytes left in XMIT queue */ /* Detailed SLIP statistics. */ int mtu; /* Our mtu (to spot changes!) */ int buffsize; /* Max buffers sizes */ unsigned long flags; /* Flag values/ mode etc */ /* long req'd: used by set_bit --RR */ #define AXF_INUSE 0 /* Channel in use */ #define AXF_ESCAPE 1 /* ESC received */ #define AXF_ERROR 2 /* Parity, etc. error */ #define AXF_KEEPTEST 3 /* Keepalive test flag */ #define AXF_OUTWAIT 4 /* is outpacket was flag */ int mode; int crcmode; /* MW: for FlexNet, SMACK etc. */ int crcauto; /* CRC auto mode */ #define CRC_MODE_NONE 0 #define CRC_MODE_FLEX 1 #define CRC_MODE_SMACK 2 #define CRC_MODE_FLEX_TEST 3 #define CRC_MODE_SMACK_TEST 4 atomic_t refcnt; struct semaphore dead_sem; }; /*---------------------------------------------------------------------------*/ static const unsigned short crc_flex_table[] = { 0x0f87, 0x1e0e, 0x2c95, 0x3d1c, 0x49a3, 0x582a, 0x6ab1, 0x7b38, 0x83cf, 0x9246, 0xa0dd, 0xb154, 0xc5eb, 0xd462, 0xe6f9, 0xf770, 0x1f06, 0x0e8f, 0x3c14, 0x2d9d, 0x5922, 0x48ab, 0x7a30, 0x6bb9, 0x934e, 0x82c7, 0xb05c, 0xa1d5, 0xd56a, 0xc4e3, 0xf678, 0xe7f1, 0x2e85, 0x3f0c, 0x0d97, 0x1c1e, 0x68a1, 0x7928, 0x4bb3, 0x5a3a, 0xa2cd, 0xb344, 0x81df, 0x9056, 0xe4e9, 0xf560, 0xc7fb, 0xd672, 0x3e04, 0x2f8d, 0x1d16, 0x0c9f, 0x7820, 0x69a9, 0x5b32, 0x4abb, 0xb24c, 0xa3c5, 0x915e, 0x80d7, 0xf468, 0xe5e1, 0xd77a, 0xc6f3, 0x4d83, 0x5c0a, 0x6e91, 0x7f18, 0x0ba7, 0x1a2e, 0x28b5, 0x393c, 0xc1cb, 0xd042, 0xe2d9, 0xf350, 0x87ef, 0x9666, 0xa4fd, 0xb574, 0x5d02, 0x4c8b, 0x7e10, 0x6f99, 0x1b26, 0x0aaf, 0x3834, 0x29bd, 0xd14a, 0xc0c3, 0xf258, 0xe3d1, 0x976e, 0x86e7, 0xb47c, 0xa5f5, 0x6c81, 0x7d08, 0x4f93, 0x5e1a, 0x2aa5, 0x3b2c, 0x09b7, 0x183e, 0xe0c9, 0xf140, 0xc3db, 0xd252, 0xa6ed, 0xb764, 0x85ff, 0x9476, 0x7c00, 0x6d89, 0x5f12, 0x4e9b, 0x3a24, 0x2bad, 0x1936, 0x08bf, 0xf048, 0xe1c1, 0xd35a, 0xc2d3, 0xb66c, 0xa7e5, 0x957e, 0x84f7, 0x8b8f, 0x9a06, 0xa89d, 0xb914, 0xcdab, 0xdc22, 0xeeb9, 0xff30, 0x07c7, 0x164e, 0x24d5, 0x355c, 0x41e3, 0x506a, 0x62f1, 0x7378, 0x9b0e, 0x8a87, 0xb81c, 0xa995, 0xdd2a, 0xcca3, 0xfe38, 0xefb1, 0x1746, 0x06cf, 0x3454, 0x25dd, 0x5162, 0x40eb, 0x7270, 0x63f9, 0xaa8d, 0xbb04, 0x899f, 0x9816, 0xeca9, 0xfd20, 0xcfbb, 0xde32, 0x26c5, 0x374c, 0x05d7, 0x145e, 0x60e1, 0x7168, 0x43f3, 0x527a, 0xba0c, 0xab85, 0x991e, 0x8897, 0xfc28, 0xeda1, 0xdf3a, 0xceb3, 0x3644, 0x27cd, 0x1556, 0x04df, 0x7060, 0x61e9, 0x5372, 0x42fb, 0xc98b, 0xd802, 0xea99, 0xfb10, 0x8faf, 0x9e26, 0xacbd, 0xbd34, 0x45c3, 0x544a, 0x66d1, 0x7758, 0x03e7, 0x126e, 0x20f5, 0x317c, 0xd90a, 0xc883, 0xfa18, 0xeb91, 0x9f2e, 0x8ea7, 0xbc3c, 0xadb5, 0x5542, 0x44cb, 0x7650, 0x67d9, 0x1366, 0x02ef, 0x3074, 0x21fd, 0xe889, 0xf900, 0xcb9b, 0xda12, 0xaead, 0xbf24, 0x8dbf, 0x9c36, 0x64c1, 0x7548, 0x47d3, 0x565a, 0x22e5, 0x336c, 0x01f7, 0x107e, 0xf808, 0xe981, 0xdb1a, 0xca93, 0xbe2c, 0xafa5, 0x9d3e, 0x8cb7, 0x7440, 0x65c9, 0x5752, 0x46db, 0x3264, 0x23ed, 0x1176, 0x00ff }; static unsigned short calc_crc_flex(unsigned char *cp, int size) { unsigned short crc = 0xffff; while (size--) crc = (crc << 8) ^ crc_flex_table[((crc >> 8) ^ *cp++) & 0xff]; return crc; } static int check_crc_flex(unsigned char *cp, int size) { unsigned short crc = 0xffff; if (size < 3) return -1; while (size--) crc = (crc << 8) ^ crc_flex_table[((crc >> 8) ^ *cp++) & 0xff]; if ((crc & 0xffff) != 0x7070) return -1; return 0; } static int check_crc_16(unsigned char *cp, int size) { unsigned short crc = 0x0000; if (size < 3) return -1; crc = crc16(0, cp, size); if (crc != 0x0000) return -1; return 0; } /* * Standard encapsulation */ static int kiss_esc(unsigned char *s, unsigned char *d, int len) { unsigned char *ptr = d; unsigned char c; /* * Send an initial END character to flush out any data that may have * accumulated in the receiver due to line noise. */ *ptr++ = END; while (len-- > 0) { switch (c = *s++) { case END: *ptr++ = ESC; *ptr++ = ESC_END; break; case ESC: *ptr++ = ESC; *ptr++ = ESC_ESC; break; default: *ptr++ = c; break; } } *ptr++ = END; return ptr - d; } /* * MW: * OK its ugly, but tell me a better solution without copying the * packet to a temporary buffer :-) */ static int kiss_esc_crc(unsigned char *s, unsigned char *d, unsigned short crc, int len) { unsigned char *ptr = d; unsigned char c=0; *ptr++ = END; while (len > 0) { if (len > 2) c = *s++; else if (len > 1) c = crc >> 8; else c = crc & 0xff; len--; switch (c) { case END: *ptr++ = ESC; *ptr++ = ESC_END; break; case ESC: *ptr++ = ESC; *ptr++ = ESC_ESC; break; default: *ptr++ = c; break; } } *ptr++ = END; return ptr - d; } /* Send one completely decapsulated AX.25 packet to the AX.25 layer. */ static void ax_bump(struct mkiss *ax) { struct sk_buff *skb; int count; spin_lock_bh(&ax->buflock); if (ax->rbuff[0] > 0x0f) { if (ax->rbuff[0] & 0x80) { if (check_crc_16(ax->rbuff, ax->rcount) < 0) { ax->dev->stats.rx_errors++; spin_unlock_bh(&ax->buflock); return; } if (ax->crcmode != CRC_MODE_SMACK && ax->crcauto) { printk(KERN_INFO "mkiss: %s: Switching to crc-smack\n", ax->dev->name); ax->crcmode = CRC_MODE_SMACK; } ax->rcount -= 2; *ax->rbuff &= ~0x80; } else if (ax->rbuff[0] & 0x20) { if (check_crc_flex(ax->rbuff, ax->rcount) < 0) { ax->dev->stats.rx_errors++; spin_unlock_bh(&ax->buflock); return; } if (ax->crcmode != CRC_MODE_FLEX && ax->crcauto) { printk(KERN_INFO "mkiss: %s: Switching to crc-flexnet\n", ax->dev->name); ax->crcmode = CRC_MODE_FLEX; } ax->rcount -= 2; /* * dl9sau bugfix: the trailling two bytes flexnet crc * will not be passed to the kernel. thus we have to * correct the kissparm signature, because it indicates * a crc but there's none */ *ax->rbuff &= ~0x20; } } count = ax->rcount; if ((skb = dev_alloc_skb(count)) == NULL) { printk(KERN_ERR "mkiss: %s: memory squeeze, dropping packet.\n", ax->dev->name); ax->dev->stats.rx_dropped++; spin_unlock_bh(&ax->buflock); return; } skb_put_data(skb, ax->rbuff, count); skb->protocol = ax25_type_trans(skb, ax->dev); netif_rx(skb); ax->dev->stats.rx_packets++; ax->dev->stats.rx_bytes += count; spin_unlock_bh(&ax->buflock); } static void kiss_unesc(struct mkiss *ax, unsigned char s) { switch (s) { case END: /* drop keeptest bit = VSV */ if (test_bit(AXF_KEEPTEST, &ax->flags)) clear_bit(AXF_KEEPTEST, &ax->flags); if (!test_and_clear_bit(AXF_ERROR, &ax->flags) && (ax->rcount > 2)) ax_bump(ax); clear_bit(AXF_ESCAPE, &ax->flags); ax->rcount = 0; return; case ESC: set_bit(AXF_ESCAPE, &ax->flags); return; case ESC_ESC: if (test_and_clear_bit(AXF_ESCAPE, &ax->flags)) s = ESC; break; case ESC_END: if (test_and_clear_bit(AXF_ESCAPE, &ax->flags)) s = END; break; } spin_lock_bh(&ax->buflock); if (!test_bit(AXF_ERROR, &ax->flags)) { if (ax->rcount < ax->buffsize) { ax->rbuff[ax->rcount++] = s; spin_unlock_bh(&ax->buflock); return; } ax->dev->stats.rx_over_errors++; set_bit(AXF_ERROR, &ax->flags); } spin_unlock_bh(&ax->buflock); } static int ax_set_mac_address(struct net_device *dev, void *addr) { struct sockaddr_ax25 *sa = addr; netif_tx_lock_bh(dev); netif_addr_lock(dev); memcpy(dev->dev_addr, &sa->sax25_call, AX25_ADDR_LEN); netif_addr_unlock(dev); netif_tx_unlock_bh(dev); return 0; } /*---------------------------------------------------------------------------*/ static void ax_changedmtu(struct mkiss *ax) { struct net_device *dev = ax->dev; unsigned char *xbuff, *rbuff, *oxbuff, *orbuff; int len; len = dev->mtu * 2; /* * allow for arrival of larger UDP packets, even if we say not to * also fixes a bug in which SunOS sends 512-byte packets even with * an MSS of 128 */ if (len < 576 * 2) len = 576 * 2; xbuff = kmalloc(len + 4, GFP_ATOMIC); rbuff = kmalloc(len + 4, GFP_ATOMIC); if (xbuff == NULL || rbuff == NULL) { printk(KERN_ERR "mkiss: %s: unable to grow ax25 buffers, " "MTU change cancelled.\n", ax->dev->name); dev->mtu = ax->mtu; kfree(xbuff); kfree(rbuff); return; } spin_lock_bh(&ax->buflock); oxbuff = ax->xbuff; ax->xbuff = xbuff; orbuff = ax->rbuff; ax->rbuff = rbuff; if (ax->xleft) { if (ax->xleft <= len) { memcpy(ax->xbuff, ax->xhead, ax->xleft); } else { ax->xleft = 0; dev->stats.tx_dropped++; } } ax->xhead = ax->xbuff; if (ax->rcount) { if (ax->rcount <= len) { memcpy(ax->rbuff, orbuff, ax->rcount); } else { ax->rcount = 0; dev->stats.rx_over_errors++; set_bit(AXF_ERROR, &ax->flags); } } ax->mtu = dev->mtu + 73; ax->buffsize = len; spin_unlock_bh(&ax->buflock); kfree(oxbuff); kfree(orbuff); } /* Encapsulate one AX.25 packet and stuff into a TTY queue. */ static void ax_encaps(struct net_device *dev, unsigned char *icp, int len) { struct mkiss *ax = netdev_priv(dev); unsigned char *p; int actual, count; if (ax->mtu != ax->dev->mtu + 73) /* Someone has been ifconfigging */ ax_changedmtu(ax); if (len > ax->mtu) { /* Sigh, shouldn't occur BUT ... */ printk(KERN_ERR "mkiss: %s: truncating oversized transmit packet!\n", ax->dev->name); dev->stats.tx_dropped++; netif_start_queue(dev); return; } p = icp; spin_lock_bh(&ax->buflock); if ((*p & 0x0f) != 0) { /* Configuration Command (kissparms(1). * Protocol spec says: never append CRC. * This fixes a very old bug in the linux * kiss driver. -- dl9sau */ switch (*p & 0xff) { case 0x85: /* command from userspace especially for us, * not for delivery to the tnc */ if (len > 1) { int cmd = (p[1] & 0xff); switch(cmd) { case 3: ax->crcmode = CRC_MODE_SMACK; break; case 2: ax->crcmode = CRC_MODE_FLEX; break; case 1: ax->crcmode = CRC_MODE_NONE; break; case 0: default: ax->crcmode = CRC_MODE_SMACK_TEST; cmd = 0; } ax->crcauto = (cmd ? 0 : 1); printk(KERN_INFO "mkiss: %s: crc mode set to %d\n", ax->dev->name, cmd); } spin_unlock_bh(&ax->buflock); netif_start_queue(dev); return; default: count = kiss_esc(p, ax->xbuff, len); } } else { unsigned short crc; switch (ax->crcmode) { case CRC_MODE_SMACK_TEST: ax->crcmode = CRC_MODE_FLEX_TEST; printk(KERN_INFO "mkiss: %s: Trying crc-smack\n", ax->dev->name); // fall through case CRC_MODE_SMACK: *p |= 0x80; crc = swab16(crc16(0, p, len)); count = kiss_esc_crc(p, ax->xbuff, crc, len+2); break; case CRC_MODE_FLEX_TEST: ax->crcmode = CRC_MODE_NONE; printk(KERN_INFO "mkiss: %s: Trying crc-flexnet\n", ax->dev->name); // fall through case CRC_MODE_FLEX: *p |= 0x20; crc = calc_crc_flex(p, len); count = kiss_esc_crc(p, ax->xbuff, crc, len+2); break; default: count = kiss_esc(p, ax->xbuff, len); } } spin_unlock_bh(&ax->buflock); set_bit(TTY_DO_WRITE_WAKEUP, &ax->tty->flags); actual = ax->tty->ops->write(ax->tty, ax->xbuff, count); dev->stats.tx_packets++; dev->stats.tx_bytes += actual; netif_trans_update(ax->dev); ax->xleft = count - actual; ax->xhead = ax->xbuff + actual; } /* Encapsulate an AX.25 packet and kick it into a TTY queue. */ static netdev_tx_t ax_xmit(struct sk_buff *skb, struct net_device *dev) { struct mkiss *ax = netdev_priv(dev); if (skb->protocol == htons(ETH_P_IP)) return ax25_ip_xmit(skb); if (!netif_running(dev)) { printk(KERN_ERR "mkiss: %s: xmit call when iface is down\n", dev->name); return NETDEV_TX_BUSY; } if (netif_queue_stopped(dev)) { /* * May be we must check transmitter timeout here ? * 14 Oct 1994 Dmitry Gorodchanin. */ if (time_before(jiffies, dev_trans_start(dev) + 20 * HZ)) { /* 20 sec timeout not reached */ return NETDEV_TX_BUSY; } printk(KERN_ERR "mkiss: %s: transmit timed out, %s?\n", dev->name, (tty_chars_in_buffer(ax->tty) || ax->xleft) ? "bad line quality" : "driver error"); ax->xleft = 0; clear_bit(TTY_DO_WRITE_WAKEUP, &ax->tty->flags); netif_start_queue(dev); } /* We were not busy, so we are now... :-) */ netif_stop_queue(dev); ax_encaps(dev, skb->data, skb->len); kfree_skb(skb); return NETDEV_TX_OK; } static int ax_open_dev(struct net_device *dev) { struct mkiss *ax = netdev_priv(dev); if (ax->tty == NULL) return -ENODEV; return 0; } /* Open the low-level part of the AX25 channel. Easy! */ static int ax_open(struct net_device *dev) { struct mkiss *ax = netdev_priv(dev); unsigned long len; if (ax->tty == NULL) return -ENODEV; /* * Allocate the frame buffers: * * rbuff Receive buffer. * xbuff Transmit buffer. */ len = dev->mtu * 2; /* * allow for arrival of larger UDP packets, even if we say not to * also fixes a bug in which SunOS sends 512-byte packets even with * an MSS of 128 */ if (len < 576 * 2) len = 576 * 2; if ((ax->rbuff = kmalloc(len + 4, GFP_KERNEL)) == NULL) goto norbuff; if ((ax->xbuff = kmalloc(len + 4, GFP_KERNEL)) == NULL) goto noxbuff; ax->mtu = dev->mtu + 73; ax->buffsize = len; ax->rcount = 0; ax->xleft = 0; ax->flags &= (1 << AXF_INUSE); /* Clear ESCAPE & ERROR flags */ spin_lock_init(&ax->buflock); return 0; noxbuff: kfree(ax->rbuff); norbuff: return -ENOMEM; } /* Close the low-level part of the AX25 channel. Easy! */ static int ax_close(struct net_device *dev) { struct mkiss *ax = netdev_priv(dev); if (ax->tty) clear_bit(TTY_DO_WRITE_WAKEUP, &ax->tty->flags); netif_stop_queue(dev); return 0; } static const struct net_device_ops ax_netdev_ops = { .ndo_open = ax_open_dev, .ndo_stop = ax_close, .ndo_start_xmit = ax_xmit, .ndo_set_mac_address = ax_set_mac_address, }; static void ax_setup(struct net_device *dev) { /* Finish setting up the DEVICE info. */ dev->mtu = AX_MTU; dev->hard_header_len = AX25_MAX_HEADER_LEN; dev->addr_len = AX25_ADDR_LEN; dev->type = ARPHRD_AX25; dev->tx_queue_len = 10; dev->header_ops = &ax25_header_ops; dev->netdev_ops = &ax_netdev_ops; memcpy(dev->broadcast, &ax25_bcast, AX25_ADDR_LEN); memcpy(dev->dev_addr, &ax25_defaddr, AX25_ADDR_LEN); dev->flags = IFF_BROADCAST | IFF_MULTICAST; } /* * We have a potential race on dereferencing tty->disc_data, because the tty * layer provides no locking at all - thus one cpu could be running * sixpack_receive_buf while another calls sixpack_close, which zeroes * tty->disc_data and frees the memory that sixpack_receive_buf is using. The * best way to fix this is to use a rwlock in the tty struct, but for now we * use a single global rwlock for all ttys in ppp line discipline. */ static DEFINE_RWLOCK(disc_data_lock); static struct mkiss *mkiss_get(struct tty_struct *tty) { struct mkiss *ax; read_lock(&disc_data_lock); ax = tty->disc_data; if (ax) atomic_inc(&ax->refcnt); read_unlock(&disc_data_lock); return ax; } static void mkiss_put(struct mkiss *ax) { if (atomic_dec_and_test(&ax->refcnt)) up(&ax->dead_sem); } static int crc_force = 0; /* Can be overridden with insmod */ static int mkiss_open(struct tty_struct *tty) { struct net_device *dev; struct mkiss *ax; int err; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (tty->ops->write == NULL) return -EOPNOTSUPP; dev = alloc_netdev(sizeof(struct mkiss), "ax%d", NET_NAME_UNKNOWN, ax_setup); if (!dev) { err = -ENOMEM; goto out; } ax = netdev_priv(dev); ax->dev = dev; spin_lock_init(&ax->buflock); atomic_set(&ax->refcnt, 1); sema_init(&ax->dead_sem, 0); ax->tty = tty; tty->disc_data = ax; tty->receive_room = 65535; tty_driver_flush_buffer(tty); /* Restore default settings */ dev->type = ARPHRD_AX25; /* Perform the low-level AX25 initialization. */ err = ax_open(ax->dev); if (err) goto out_free_netdev; err = register_netdev(dev); if (err) goto out_free_buffers; /* after register_netdev() - because else printk smashes the kernel */ switch (crc_force) { case 3: ax->crcmode = CRC_MODE_SMACK; printk(KERN_INFO "mkiss: %s: crc mode smack forced.\n", ax->dev->name); break; case 2: ax->crcmode = CRC_MODE_FLEX; printk(KERN_INFO "mkiss: %s: crc mode flexnet forced.\n", ax->dev->name); break; case 1: ax->crcmode = CRC_MODE_NONE; printk(KERN_INFO "mkiss: %s: crc mode disabled.\n", ax->dev->name); break; case 0: /* fall through */ default: crc_force = 0; printk(KERN_INFO "mkiss: %s: crc mode is auto.\n", ax->dev->name); ax->crcmode = CRC_MODE_SMACK_TEST; } ax->crcauto = (crc_force ? 0 : 1); netif_start_queue(dev); /* Done. We have linked the TTY line to a channel. */ return 0; out_free_buffers: kfree(ax->rbuff); kfree(ax->xbuff); out_free_netdev: free_netdev(dev); out: return err; } static void mkiss_close(struct tty_struct *tty) { struct mkiss *ax; write_lock_irq(&disc_data_lock); ax = tty->disc_data; tty->disc_data = NULL; write_unlock_irq(&disc_data_lock); if (!ax) return; /* * We have now ensured that nobody can start using ap from now on, but * we have to wait for all existing users to finish. */ if (!atomic_dec_and_test(&ax->refcnt)) down(&ax->dead_sem); /* * Halt the transmit queue so that a new transmit cannot scribble * on our buffers */ netif_stop_queue(ax->dev); unregister_netdev(ax->dev); /* Free all AX25 frame buffers after unreg. */ kfree(ax->rbuff); kfree(ax->xbuff); ax->tty = NULL; free_netdev(ax->dev); } /* Perform I/O control on an active ax25 channel. */ static int mkiss_ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg) { struct mkiss *ax = mkiss_get(tty); struct net_device *dev; unsigned int tmp, err; /* First make sure we're connected. */ if (ax == NULL) return -ENXIO; dev = ax->dev; switch (cmd) { case SIOCGIFNAME: err = copy_to_user((void __user *) arg, ax->dev->name, strlen(ax->dev->name) + 1) ? -EFAULT : 0; break; case SIOCGIFENCAP: err = put_user(4, (int __user *) arg); break; case SIOCSIFENCAP: if (get_user(tmp, (int __user *) arg)) { err = -EFAULT; break; } ax->mode = tmp; dev->addr_len = AX25_ADDR_LEN; dev->hard_header_len = AX25_KISS_HEADER_LEN + AX25_MAX_HEADER_LEN + 3; dev->type = ARPHRD_AX25; err = 0; break; case SIOCSIFHWADDR: { char addr[AX25_ADDR_LEN]; if (copy_from_user(&addr, (void __user *) arg, AX25_ADDR_LEN)) { err = -EFAULT; break; } netif_tx_lock_bh(dev); memcpy(dev->dev_addr, addr, AX25_ADDR_LEN); netif_tx_unlock_bh(dev); err = 0; break; } default: err = -ENOIOCTLCMD; } mkiss_put(ax); return err; } #ifdef CONFIG_COMPAT static long mkiss_compat_ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg) { switch (cmd) { case SIOCGIFNAME: case SIOCGIFENCAP: case SIOCSIFENCAP: case SIOCSIFHWADDR: return mkiss_ioctl(tty, file, cmd, (unsigned long)compat_ptr(arg)); } return -ENOIOCTLCMD; } #endif /* * Handle the 'receiver data ready' interrupt. * This function is called by the 'tty_io' module in the kernel when * a block of data has been received, which can now be decapsulated * and sent on to the AX.25 layer for further processing. */ static void mkiss_receive_buf(struct tty_struct *tty, const unsigned char *cp, char *fp, int count) { struct mkiss *ax = mkiss_get(tty); if (!ax) return; /* * Argh! mtu change time! - costs us the packet part received * at the change */ if (ax->mtu != ax->dev->mtu + 73) ax_changedmtu(ax); /* Read the characters out of the buffer */ while (count--) { if (fp != NULL && *fp++) { if (!test_and_set_bit(AXF_ERROR, &ax->flags)) ax->dev->stats.rx_errors++; cp++; continue; } kiss_unesc(ax, *cp++); } mkiss_put(ax); tty_unthrottle(tty); } /* * Called by the driver when there's room for more data. If we have * more packets to send, we send them here. */ static void mkiss_write_wakeup(struct tty_struct *tty) { struct mkiss *ax = mkiss_get(tty); int actual; if (!ax) return; if (ax->xleft <= 0) { /* Now serial buffer is almost free & we can start * transmission of another packet */ clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); netif_wake_queue(ax->dev); goto out; } actual = tty->ops->write(tty, ax->xhead, ax->xleft); ax->xleft -= actual; ax->xhead += actual; out: mkiss_put(ax); } static struct tty_ldisc_ops ax_ldisc = { .owner = THIS_MODULE, .magic = TTY_LDISC_MAGIC, .name = "mkiss", .open = mkiss_open, .close = mkiss_close, .ioctl = mkiss_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = mkiss_compat_ioctl, #endif .receive_buf = mkiss_receive_buf, .write_wakeup = mkiss_write_wakeup }; static const char banner[] __initconst = KERN_INFO \ "mkiss: AX.25 Multikiss, Hans Albas PE1AYX\n"; static const char msg_regfail[] __initconst = KERN_ERR \ "mkiss: can't register line discipline (err = %d)\n"; static int __init mkiss_init_driver(void) { int status; printk(banner); status = tty_register_ldisc(N_AX25, &ax_ldisc); if (status != 0) printk(msg_regfail, status); return status; } static const char msg_unregfail[] = KERN_ERR \ "mkiss: can't unregister line discipline (err = %d)\n"; static void __exit mkiss_exit_driver(void) { int ret; if ((ret = tty_unregister_ldisc(N_AX25))) printk(msg_unregfail, ret); } MODULE_AUTHOR("Ralf Baechle DL5RB <ralf@linux-mips.org>"); MODULE_DESCRIPTION("KISS driver for AX.25 over TTYs"); module_param(crc_force, int, 0); MODULE_PARM_DESC(crc_force, "crc [0 = auto | 1 = none | 2 = flexnet | 3 = smack]"); MODULE_LICENSE("GPL"); MODULE_ALIAS_LDISC(N_AX25); module_init(mkiss_init_driver); module_exit(mkiss_exit_driver); |
213 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_BTREE_H__ #define __XFS_BTREE_H__ struct xfs_buf; struct xfs_inode; struct xfs_mount; struct xfs_trans; extern kmem_zone_t *xfs_btree_cur_zone; /* * Generic key, ptr and record wrapper structures. * * These are disk format structures, and are converted where necessary * by the btree specific code that needs to interpret them. */ union xfs_btree_ptr { __be32 s; /* short form ptr */ __be64 l; /* long form ptr */ }; /* * The in-core btree key. Overlapping btrees actually store two keys * per pointer, so we reserve enough memory to hold both. The __*bigkey * items should never be accessed directly. */ union xfs_btree_key { struct xfs_bmbt_key bmbt; xfs_bmdr_key_t bmbr; /* bmbt root block */ xfs_alloc_key_t alloc; struct xfs_inobt_key inobt; struct xfs_rmap_key rmap; struct xfs_rmap_key __rmap_bigkey[2]; struct xfs_refcount_key refc; }; union xfs_btree_rec { struct xfs_bmbt_rec bmbt; xfs_bmdr_rec_t bmbr; /* bmbt root block */ struct xfs_alloc_rec alloc; struct xfs_inobt_rec inobt; struct xfs_rmap_rec rmap; struct xfs_refcount_rec refc; }; /* * This nonsense is to make -wlint happy. */ #define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi) #define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi) #define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi) #define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi) #define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi) #define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) #define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) #define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi) #define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi) #define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi) uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); /* * For logging record fields. */ #define XFS_BB_MAGIC (1 << 0) #define XFS_BB_LEVEL (1 << 1) #define XFS_BB_NUMRECS (1 << 2) #define XFS_BB_LEFTSIB (1 << 3) #define XFS_BB_RIGHTSIB (1 << 4) #define XFS_BB_BLKNO (1 << 5) #define XFS_BB_LSN (1 << 6) #define XFS_BB_UUID (1 << 7) #define XFS_BB_OWNER (1 << 8) #define XFS_BB_NUM_BITS 5 #define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) #define XFS_BB_NUM_BITS_CRC 9 #define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1) /* * Generic stats interface */ #define XFS_BTREE_STATS_INC(cur, stat) \ XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat) #define XFS_BTREE_STATS_ADD(cur, stat, val) \ XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val) #define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */ struct xfs_btree_ops { /* size of the key and record structures */ size_t key_len; size_t rec_len; /* cursor operations */ struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *); void (*update_cursor)(struct xfs_btree_cur *src, struct xfs_btree_cur *dst); /* update btree root pointer */ void (*set_root)(struct xfs_btree_cur *cur, union xfs_btree_ptr *nptr, int level_change); /* block allocation / freeing */ int (*alloc_block)(struct xfs_btree_cur *cur, union xfs_btree_ptr *start_bno, union xfs_btree_ptr *new_bno, int *stat); int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); /* update last record information */ void (*update_lastrec)(struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_rec *rec, int ptr, int reason); /* records in block/level */ int (*get_minrecs)(struct xfs_btree_cur *cur, int level); int (*get_maxrecs)(struct xfs_btree_cur *cur, int level); /* records on disk. Matter for the root in inode case. */ int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level); /* init values of btree structures */ void (*init_key_from_rec)(union xfs_btree_key *key, union xfs_btree_rec *rec); void (*init_rec_from_cur)(struct xfs_btree_cur *cur, union xfs_btree_rec *rec); void (*init_ptr_from_cur)(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); void (*init_high_key_from_rec)(union xfs_btree_key *key, union xfs_btree_rec *rec); /* difference between key value and cursor value */ int64_t (*key_diff)(struct xfs_btree_cur *cur, union xfs_btree_key *key); /* * Difference between key2 and key1 -- positive if key1 > key2, * negative if key1 < key2, and zero if equal. */ int64_t (*diff_two_keys)(struct xfs_btree_cur *cur, union xfs_btree_key *key1, union xfs_btree_key *key2); const struct xfs_buf_ops *buf_ops; /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, union xfs_btree_key *k1, union xfs_btree_key *k2); /* check that r1 is lower than r2 */ int (*recs_inorder)(struct xfs_btree_cur *cur, union xfs_btree_rec *r1, union xfs_btree_rec *r2); }; /* * Reasons for the update_lastrec method to be called. */ #define LASTREC_UPDATE 0 #define LASTREC_INSREC 1 #define LASTREC_DELREC 2 union xfs_btree_irec { struct xfs_alloc_rec_incore a; struct xfs_bmbt_irec b; struct xfs_inobt_rec_incore i; struct xfs_rmap_irec r; struct xfs_refcount_irec rc; }; /* Per-AG btree private information. */ union xfs_btree_cur_private { struct { unsigned long nr_ops; /* # record updates */ int shape_changes; /* # of extent splits */ } refc; }; /* * Btree cursor structure. * This collects all information needed by the btree code in one place. */ typedef struct xfs_btree_cur { struct xfs_trans *bc_tp; /* transaction we're in, if any */ struct xfs_mount *bc_mp; /* file system mount struct */ const struct xfs_btree_ops *bc_ops; uint bc_flags; /* btree features - below */ union xfs_btree_irec bc_rec; /* current insert/search record value */ struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */ int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */ uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */ #define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead */ #define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead */ uint8_t bc_nlevels; /* number of levels in the tree */ uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ xfs_btnum_t bc_btnum; /* identifies which btree type */ int bc_statoff; /* offset of btre stats array */ union { struct { /* needed for BNO, CNT, INO */ struct xfs_buf *agbp; /* agf/agi buffer pointer */ xfs_agnumber_t agno; /* ag number */ union xfs_btree_cur_private priv; } a; struct { /* needed for BMAP */ struct xfs_inode *ip; /* pointer to our inode */ int allocated; /* count of alloced */ short forksize; /* fork's inode space */ char whichfork; /* data or attr fork */ char flags; /* flags */ #define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */ #define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */ } b; } bc_private; /* per-btree type data */ } xfs_btree_cur_t; /* cursor flags */ #define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */ #define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */ #define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ #define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */ #define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */ #define XFS_BTREE_NOERROR 0 #define XFS_BTREE_ERROR 1 /* * Convert from buffer to btree block header. */ #define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr)) /* * Internal long and short btree block checks. They return NULL if the * block is ok or the address of the failed check otherwise. */ xfs_failaddr_t __xfs_btree_check_lblock(struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp); xfs_failaddr_t __xfs_btree_check_sblock(struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp); /* * Check that block header is ok. */ int xfs_btree_check_block( struct xfs_btree_cur *cur, /* btree cursor */ struct xfs_btree_block *block, /* generic btree block pointer */ int level, /* level of the btree block */ struct xfs_buf *bp); /* buffer containing block, if any */ /* * Check that (long) pointer is ok. */ bool /* error (0 or EFSCORRUPTED) */ xfs_btree_check_lptr( struct xfs_btree_cur *cur, /* btree cursor */ xfs_fsblock_t fsbno, /* btree block disk address */ int level); /* btree block level */ /* * Check that (short) pointer is ok. */ bool /* error (0 or EFSCORRUPTED) */ xfs_btree_check_sptr( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t agbno, /* btree block disk address */ int level); /* btree block level */ /* * Delete the btree cursor. */ void xfs_btree_del_cursor( xfs_btree_cur_t *cur, /* btree cursor */ int error); /* del because of error */ /* * Duplicate the btree cursor. * Allocate a new one, copy the record, re-get the buffers. */ int /* error */ xfs_btree_dup_cursor( xfs_btree_cur_t *cur, /* input cursor */ xfs_btree_cur_t **ncur);/* output cursor */ /* * Get a buffer for the block, return it with no data read. * Long-form addressing. */ struct xfs_buf * /* buffer for fsbno */ xfs_btree_get_bufl( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t fsbno, /* file system block number */ uint lock); /* lock flags for get_buf */ /* * Get a buffer for the block, return it with no data read. * Short-form addressing. */ struct xfs_buf * /* buffer for agno/agbno */ xfs_btree_get_bufs( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ xfs_agnumber_t agno, /* allocation group number */ xfs_agblock_t agbno, /* allocation group block number */ uint lock); /* lock flags for get_buf */ /* * Check for the cursor referring to the last block at the given level. */ int /* 1=is last block, 0=not last block */ xfs_btree_islastblock( xfs_btree_cur_t *cur, /* btree cursor */ int level); /* level to check */ /* * Compute first and last byte offsets for the fields given. * Interprets the offsets table, which contains struct field offsets. */ void xfs_btree_offsets( int64_t fields, /* bitmask of fields */ const short *offsets,/* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first byte offset */ int *last); /* output: last byte offset */ /* * Get a buffer for the block, return it read in. * Long-form addressing. */ int /* error */ xfs_btree_read_bufl( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t fsbno, /* file system block number */ uint lock, /* lock flags for read_buf */ struct xfs_buf **bpp, /* buffer for fsbno */ int refval, /* ref count value for buffer */ const struct xfs_buf_ops *ops); /* * Read-ahead the block, don't wait for it, don't return a buffer. * Long-form addressing. */ void /* error */ xfs_btree_reada_bufl( struct xfs_mount *mp, /* file system mount point */ xfs_fsblock_t fsbno, /* file system block number */ xfs_extlen_t count, /* count of filesystem blocks */ const struct xfs_buf_ops *ops); /* * Read-ahead the block, don't wait for it, don't return a buffer. * Short-form addressing. */ void /* error */ xfs_btree_reada_bufs( struct xfs_mount *mp, /* file system mount point */ xfs_agnumber_t agno, /* allocation group number */ xfs_agblock_t agbno, /* allocation group block number */ xfs_extlen_t count, /* count of filesystem blocks */ const struct xfs_buf_ops *ops); /* * Initialise a new btree block header */ void xfs_btree_init_block( struct xfs_mount *mp, struct xfs_buf *bp, xfs_btnum_t btnum, __u16 level, __u16 numrecs, __u64 owner, unsigned int flags); void xfs_btree_init_block_int( struct xfs_mount *mp, struct xfs_btree_block *buf, xfs_daddr_t blkno, xfs_btnum_t btnum, __u16 level, __u16 numrecs, __u64 owner, unsigned int flags); /* * Common btree core entry points. */ int xfs_btree_increment(struct xfs_btree_cur *, int, int *); int xfs_btree_decrement(struct xfs_btree_cur *, int, int *); int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *); int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *); int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); int xfs_btree_insert(struct xfs_btree_cur *, int *); int xfs_btree_delete(struct xfs_btree_cur *, int *); int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); int xfs_btree_change_owner(struct xfs_btree_cur *cur, uint64_t new_owner, struct list_head *buffer_list); /* * btree block CRC helpers */ void xfs_btree_lblock_calc_crc(struct xfs_buf *); bool xfs_btree_lblock_verify_crc(struct xfs_buf *); void xfs_btree_sblock_calc_crc(struct xfs_buf *); bool xfs_btree_sblock_verify_crc(struct xfs_buf *); /* * Internal btree helpers also used by xfs_bmap.c. */ void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int); void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int); /* * Helpers. */ static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block) { return be16_to_cpu(block->bb_numrecs); } static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block, uint16_t numrecs) { block->bb_numrecs = cpu_to_be16(numrecs); } static inline int xfs_btree_get_level(struct xfs_btree_block *block) { return be16_to_cpu(block->bb_level); } /* * Min and max functions for extlen, agblock, fileoff, and filblks types. */ #define XFS_EXTLEN_MIN(a,b) min_t(xfs_extlen_t, (a), (b)) #define XFS_EXTLEN_MAX(a,b) max_t(xfs_extlen_t, (a), (b)) #define XFS_AGBLOCK_MIN(a,b) min_t(xfs_agblock_t, (a), (b)) #define XFS_AGBLOCK_MAX(a,b) max_t(xfs_agblock_t, (a), (b)) #define XFS_FILEOFF_MIN(a,b) min_t(xfs_fileoff_t, (a), (b)) #define XFS_FILEOFF_MAX(a,b) max_t(xfs_fileoff_t, (a), (b)) #define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b)) #define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) xfs_failaddr_t xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); xfs_failaddr_t xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp, uint64_t owner); xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp, unsigned int max_recs); uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len); unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len); /* return codes */ #define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */ #define XFS_BTREE_QUERY_RANGE_ABORT 1 /* stop iterating */ typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur, union xfs_btree_rec *rec, void *priv); int xfs_btree_query_range(struct xfs_btree_cur *cur, union xfs_btree_irec *low_rec, union xfs_btree_irec *high_rec, xfs_btree_query_range_fn fn, void *priv); int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn, void *priv); typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, void *data); int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, xfs_btree_visit_blocks_fn fn, void *data); int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks); union xfs_btree_rec *xfs_btree_rec_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); union xfs_btree_key *xfs_btree_key_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); union xfs_btree_key *xfs_btree_high_key_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); union xfs_btree_ptr *xfs_btree_ptr_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); int xfs_btree_lookup_get_block(struct xfs_btree_cur *cur, int level, union xfs_btree_ptr *pp, struct xfs_btree_block **blkp); struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur, int level, struct xfs_buf **bpp); bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); int64_t xfs_btree_diff_two_ptrs(struct xfs_btree_cur *cur, const union xfs_btree_ptr *a, const union xfs_btree_ptr *b); void xfs_btree_get_sibling(struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_ptr *ptr, int lr); void xfs_btree_get_keys(struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_key *key); union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, union xfs_btree_key *key); int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low, union xfs_btree_irec *high, bool *exists); bool xfs_btree_has_more_records(struct xfs_btree_cur *cur); #endif /* __XFS_BTREE_H__ */ |
50 48 48 18 12 28 10 27 50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 | /* * Copyright (c) 2013 * Phillip Lougher <phillip@squashfs.org.uk> * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. */ #include <linux/types.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/buffer_head.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "decompressor.h" #include "squashfs.h" /* * This file implements single-threaded decompression in the * decompressor framework */ struct squashfs_stream { void *stream; struct mutex mutex; }; void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, void *comp_opts) { struct squashfs_stream *stream; int err = -ENOMEM; stream = kmalloc(sizeof(*stream), GFP_KERNEL); if (stream == NULL) goto out; stream->stream = msblk->decompressor->init(msblk, comp_opts); if (IS_ERR(stream->stream)) { err = PTR_ERR(stream->stream); goto out; } kfree(comp_opts); mutex_init(&stream->mutex); return stream; out: kfree(stream); return ERR_PTR(err); } void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) { struct squashfs_stream *stream = msblk->stream; if (stream) { msblk->decompressor->free(stream->stream); kfree(stream); } } int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, int b, int offset, int length, struct squashfs_page_actor *output) { int res; struct squashfs_stream *stream = msblk->stream; mutex_lock(&stream->mutex); res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, offset, length, output); mutex_unlock(&stream->mutex); if (res < 0) ERROR("%s decompression failed, data probably corrupt\n", msblk->decompressor->name); return res; } int squashfs_max_decompressors(void) { return 1; } |
216 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_UDF_SB_H #define __LINUX_UDF_SB_H #include <linux/mutex.h> #include <linux/bitops.h> #include <linux/magic.h> #define UDF_MAX_READ_VERSION 0x0250 #define UDF_MAX_WRITE_VERSION 0x0201 #define UDF_FLAG_USE_EXTENDED_FE 0 #define UDF_VERS_USE_EXTENDED_FE 0x0200 #define UDF_FLAG_USE_STREAMS 1 #define UDF_VERS_USE_STREAMS 0x0200 #define UDF_FLAG_USE_SHORT_AD 2 #define UDF_FLAG_USE_AD_IN_ICB 3 #define UDF_FLAG_USE_FILE_CTIME_EA 4 #define UDF_FLAG_STRICT 5 #define UDF_FLAG_UNDELETE 6 #define UDF_FLAG_UNHIDE 7 #define UDF_FLAG_VARCONV 8 #define UDF_FLAG_NLS_MAP 9 #define UDF_FLAG_UTF8 10 #define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */ #define UDF_FLAG_GID_FORGET 12 #define UDF_FLAG_UID_SET 13 #define UDF_FLAG_GID_SET 14 #define UDF_FLAG_SESSION_SET 15 #define UDF_FLAG_LASTBLOCK_SET 16 #define UDF_FLAG_BLOCKSIZE_SET 17 #define UDF_FLAG_INCONSISTENT 18 #define UDF_FLAG_RW_INCOMPAT 19 /* Set when we find RW incompatible * feature */ #define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001 #define UDF_PART_FLAG_UNALLOC_TABLE 0x0002 #define UDF_PART_FLAG_FREED_BITMAP 0x0004 #define UDF_PART_FLAG_FREED_TABLE 0x0008 #define UDF_PART_FLAG_READ_ONLY 0x0010 #define UDF_PART_FLAG_WRITE_ONCE 0x0020 #define UDF_PART_FLAG_REWRITABLE 0x0040 #define UDF_PART_FLAG_OVERWRITABLE 0x0080 #define UDF_MAX_BLOCK_LOADED 8 #define UDF_TYPE1_MAP15 0x1511U #define UDF_VIRTUAL_MAP15 0x1512U #define UDF_VIRTUAL_MAP20 0x2012U #define UDF_SPARABLE_MAP15 0x1522U #define UDF_METADATA_MAP25 0x2511U #define UDF_INVALID_MODE ((umode_t)-1) #pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */ #define MF_DUPLICATE_MD 0x01 #define MF_MIRROR_FE_LOADED 0x02 struct udf_meta_data { __u32 s_meta_file_loc; __u32 s_mirror_file_loc; __u32 s_bitmap_file_loc; __u32 s_alloc_unit_size; __u16 s_align_unit_size; /* * Partition Reference Number of the associated physical / sparable * partition */ __u16 s_phys_partition_ref; int s_flags; struct inode *s_metadata_fe; struct inode *s_mirror_fe; struct inode *s_bitmap_fe; }; struct udf_sparing_data { __u16 s_packet_len; struct buffer_head *s_spar_map[4]; }; struct udf_virtual_data { __u32 s_num_entries; __u16 s_start_offset; }; struct udf_bitmap { __u32 s_extPosition; int s_nr_groups; struct buffer_head *s_block_bitmap[0]; }; struct udf_part_map { union { struct udf_bitmap *s_bitmap; struct inode *s_table; } s_uspace; union { struct udf_bitmap *s_bitmap; struct inode *s_table; } s_fspace; __u32 s_partition_root; __u32 s_partition_len; __u16 s_partition_type; __u16 s_partition_num; union { struct udf_sparing_data s_sparing; struct udf_virtual_data s_virtual; struct udf_meta_data s_metadata; } s_type_specific; __u32 (*s_partition_func)(struct super_block *, __u32, __u16, __u32); __u16 s_volumeseqnum; __u16 s_partition_flags; }; #pragma pack() struct udf_sb_info { struct udf_part_map *s_partmaps; __u8 s_volume_ident[32]; /* Overall info */ __u16 s_partitions; __u16 s_partition; /* Sector headers */ __s32 s_session; __u32 s_anchor; __u32 s_last_block; struct buffer_head *s_lvid_bh; /* Default permissions */ umode_t s_umask; kgid_t s_gid; kuid_t s_uid; umode_t s_fmode; umode_t s_dmode; /* Lock protecting consistency of above permission settings */ rwlock_t s_cred_lock; /* Root Info */ struct timespec64 s_record_time; /* Fileset Info */ __u16 s_serial_number; /* highest UDF revision we have recorded to this media */ __u16 s_udfrev; /* Miscellaneous flags */ unsigned long s_flags; /* Encoding info */ struct nls_table *s_nls_map; /* VAT inode */ struct inode *s_vat_inode; struct mutex s_alloc_mutex; /* Protected by s_alloc_mutex */ unsigned int s_lvid_dirty; }; static inline struct udf_sb_info *UDF_SB(struct super_block *sb) { return sb->s_fs_info; } struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb); int udf_compute_nr_groups(struct super_block *sb, u32 partition); static inline int UDF_QUERY_FLAG(struct super_block *sb, int flag) { return test_bit(flag, &UDF_SB(sb)->s_flags); } static inline void UDF_SET_FLAG(struct super_block *sb, int flag) { set_bit(flag, &UDF_SB(sb)->s_flags); } static inline void UDF_CLEAR_FLAG(struct super_block *sb, int flag) { clear_bit(flag, &UDF_SB(sb)->s_flags); } #endif /* __LINUX_UDF_SB_H */ |
5 4 175 135 110 10 5 25 276 330 144 76 83 139 75 74 74 33 69 75 130 128 126 126 126 130 76 76 2 22 4 25 76 57 57 57 79 79 49 9 43 43 41 39 5 34 38 37 36 2 49 8 3 2 4 6 8 16 16 16 1 15 3 2 4 10 9 5 2 16 38 38 5 32 33 38 9 8 8 9 18 17 6 18 33 3 33 33 33 32 30 30 32 3 29 3 38 37 38 38 1 34 34 38 41 15 27 66 32 66 62 66 66 66 66 86 86 86 85 86 103 170 170 63 169 124 119 162 11 81 33 48 3 45 44 36 36 108 25 24 22 21 11 84 105 38 60 44 24 24 106 170 61 60 43 89 53 89 89 89 55 31 24 24 39 35 35 35 69 53 89 89 119 90 1 10 11 10 1 20 15 3 84 124 124 102 101 7 123 33 33 29 29 26 25 26 17 17 5 11 2 2 12 1 12 8 23 1 4 23 22 3 22 9 2 30 97 95 95 95 93 93 93 93 89 89 5 4 6 5 90 89 83 89 85 26 26 154 86 176 84 243 86 119 139 157 68 156 202 166 4 1 5 5 11 60 244 240 198 164 303 159 184 89 184 184 283 283 283 256 160 160 283 299 28 28 11 11 10 11 10 28 27 10 17 32 32 29 1 10 15 14 13 1 13 1 12 1 11 29 31 3 3 5 5 1 3 5 8 5 5 2 2 2 2 1 2 8 3 1 2 3 144 144 143 143 148 112 60 53 53 53 53 53 9 107 112 15 15 15 15 309 310 126 144 143 143 143 4 139 54 174 179 3 176 179 174 17 16 146 36 3 4 3 4 4 1 1 167 26 4 2 22 20 4 3 2 1 4 1 17 26 23 23 3 12 82 2 1 1 20 20 20 16 16 16 16 21 21 20 21 21 7 45 45 45 45 40 4 4 4 4 4 4 4 4 4 4 4 1 17 12 12 12 12 12 12 12 12 17 12 17 4 16 14 17 13 12 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 | /* * net/tipc/socket.c: TIPC socket API * * Copyright (c) 2001-2007, 2012-2017, Ericsson AB * Copyright (c) 2004-2008, 2010-2013, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <linux/rhashtable.h> #include <linux/sched/signal.h> #include "core.h" #include "name_table.h" #include "node.h" #include "link.h" #include "name_distr.h" #include "socket.h" #include "bcast.h" #include "netlink.h" #include "group.h" #define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ #define CONN_PROBING_INTV msecs_to_jiffies(3600000) /* [ms] => 1 h */ #define TIPC_FWD_MSG 1 #define TIPC_MAX_PORT 0xffffffff #define TIPC_MIN_PORT 1 #define TIPC_ACK_RATE 4 /* ACK at 1/4 of of rcv window size */ enum { TIPC_LISTEN = TCP_LISTEN, TIPC_ESTABLISHED = TCP_ESTABLISHED, TIPC_OPEN = TCP_CLOSE, TIPC_DISCONNECTING = TCP_CLOSE_WAIT, TIPC_CONNECTING = TCP_SYN_SENT, }; struct sockaddr_pair { struct sockaddr_tipc sock; struct sockaddr_tipc member; }; /** * struct tipc_sock - TIPC socket structure * @sk: socket - interacts with 'port' and with user via the socket API * @conn_type: TIPC type used when connection was established * @conn_instance: TIPC instance used when connection was established * @published: non-zero if port has one or more associated names * @max_pkt: maximum packet size "hint" used when building messages sent by port * @portid: unique port identity in TIPC socket hash table * @phdr: preformatted message header used when sending messages * #cong_links: list of congested links * @publications: list of publications for port * @blocking_link: address of the congested link we are currently sleeping on * @pub_count: total # of publications port has made during its lifetime * @probing_state: * @conn_timeout: the time we can wait for an unresponded setup request * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue * @cong_link_cnt: number of congested links * @snt_unacked: # messages sent by socket, and not yet acked by peer * @rcv_unacked: # messages read by user, but not yet acked back to peer * @peer: 'connected' peer for dgram/rdm * @node: hash table node * @mc_method: cookie for use between socket and broadcast layer * @rcu: rcu struct for tipc_sock */ struct tipc_sock { struct sock sk; u32 conn_type; u32 conn_instance; int published; u32 max_pkt; u32 portid; struct tipc_msg phdr; struct list_head cong_links; struct list_head publications; u32 pub_count; uint conn_timeout; atomic_t dupl_rcvcnt; bool probe_unacked; u16 cong_link_cnt; u16 snt_unacked; u16 snd_win; u16 peer_caps; u16 rcv_unacked; u16 rcv_win; struct sockaddr_tipc peer; struct rhash_head node; struct tipc_mc_method mc_method; struct rcu_head rcu; struct tipc_group *group; bool group_is_open; }; static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); static void tipc_data_ready(struct sock *sk); static void tipc_write_space(struct sock *sk); static void tipc_sock_destruct(struct sock *sk); static int tipc_release(struct socket *sock); static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, bool kern); static void tipc_sk_timeout(struct timer_list *t); static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq); static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq); static int tipc_sk_leave(struct tipc_sock *tsk); static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid); static int tipc_sk_insert(struct tipc_sock *tsk); static void tipc_sk_remove(struct tipc_sock *tsk); static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz); static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz); static const struct proto_ops packet_ops; static const struct proto_ops stream_ops; static const struct proto_ops msg_ops; static struct proto tipc_proto; static const struct rhashtable_params tsk_rht_params; static u32 tsk_own_node(struct tipc_sock *tsk) { return msg_prevnode(&tsk->phdr); } static u32 tsk_peer_node(struct tipc_sock *tsk) { return msg_destnode(&tsk->phdr); } static u32 tsk_peer_port(struct tipc_sock *tsk) { return msg_destport(&tsk->phdr); } static bool tsk_unreliable(struct tipc_sock *tsk) { return msg_src_droppable(&tsk->phdr) != 0; } static void tsk_set_unreliable(struct tipc_sock *tsk, bool unreliable) { msg_set_src_droppable(&tsk->phdr, unreliable ? 1 : 0); } static bool tsk_unreturnable(struct tipc_sock *tsk) { return msg_dest_droppable(&tsk->phdr) != 0; } static void tsk_set_unreturnable(struct tipc_sock *tsk, bool unreturnable) { msg_set_dest_droppable(&tsk->phdr, unreturnable ? 1 : 0); } static int tsk_importance(struct tipc_sock *tsk) { return msg_importance(&tsk->phdr); } static int tsk_set_importance(struct tipc_sock *tsk, int imp) { if (imp > TIPC_CRITICAL_IMPORTANCE) return -EINVAL; msg_set_importance(&tsk->phdr, (u32)imp); return 0; } static struct tipc_sock *tipc_sk(const struct sock *sk) { return container_of(sk, struct tipc_sock, sk); } static bool tsk_conn_cong(struct tipc_sock *tsk) { return tsk->snt_unacked > tsk->snd_win; } static u16 tsk_blocks(int len) { return ((len / FLOWCTL_BLK_SZ) + 1); } /* tsk_blocks(): translate a buffer size in bytes to number of * advertisable blocks, taking into account the ratio truesize(len)/len * We can trust that this ratio is always < 4 for len >= FLOWCTL_BLK_SZ */ static u16 tsk_adv_blocks(int len) { return len / FLOWCTL_BLK_SZ / 4; } /* tsk_inc(): increment counter for sent or received data * - If block based flow control is not supported by peer we * fall back to message based ditto, incrementing the counter */ static u16 tsk_inc(struct tipc_sock *tsk, int msglen) { if (likely(tsk->peer_caps & TIPC_BLOCK_FLOWCTL)) return ((msglen / FLOWCTL_BLK_SZ) + 1); return 1; } /** * tsk_advance_rx_queue - discard first buffer in socket receive queue * * Caller must hold socket lock */ static void tsk_advance_rx_queue(struct sock *sk) { kfree_skb(__skb_dequeue(&sk->sk_receive_queue)); } /* tipc_sk_respond() : send response message back to sender */ static void tipc_sk_respond(struct sock *sk, struct sk_buff *skb, int err) { u32 selector; u32 dnode; u32 onode = tipc_own_addr(sock_net(sk)); if (!tipc_msg_reverse(onode, &skb, err)) return; dnode = msg_destnode(buf_msg(skb)); selector = msg_origport(buf_msg(skb)); tipc_node_xmit_skb(sock_net(sk), skb, dnode, selector); } /** * tsk_rej_rx_queue - reject all buffers in socket receive queue * * Caller must hold socket lock */ static void tsk_rej_rx_queue(struct sock *sk) { struct sk_buff *skb; while ((skb = __skb_dequeue(&sk->sk_receive_queue))) tipc_sk_respond(sk, skb, TIPC_ERR_NO_PORT); } static bool tipc_sk_connected(struct sock *sk) { return sk->sk_state == TIPC_ESTABLISHED; } /* tipc_sk_type_connectionless - check if the socket is datagram socket * @sk: socket * * Returns true if connection less, false otherwise */ static bool tipc_sk_type_connectionless(struct sock *sk) { return sk->sk_type == SOCK_RDM || sk->sk_type == SOCK_DGRAM; } /* tsk_peer_msg - verify if message was sent by connected port's peer * * Handles cases where the node's network address has changed from * the default of <0.0.0> to its configured setting. */ static bool tsk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg) { struct sock *sk = &tsk->sk; u32 self = tipc_own_addr(sock_net(sk)); u32 peer_port = tsk_peer_port(tsk); u32 orig_node, peer_node; if (unlikely(!tipc_sk_connected(sk))) return false; if (unlikely(msg_origport(msg) != peer_port)) return false; orig_node = msg_orignode(msg); peer_node = tsk_peer_node(tsk); if (likely(orig_node == peer_node)) return true; if (!orig_node && peer_node == self) return true; if (!peer_node && orig_node == self) return true; return false; } /* tipc_set_sk_state - set the sk_state of the socket * @sk: socket * * Caller must hold socket lock * * Returns 0 on success, errno otherwise */ static int tipc_set_sk_state(struct sock *sk, int state) { int oldsk_state = sk->sk_state; int res = -EINVAL; switch (state) { case TIPC_OPEN: res = 0; break; case TIPC_LISTEN: case TIPC_CONNECTING: if (oldsk_state == TIPC_OPEN) res = 0; break; case TIPC_ESTABLISHED: if (oldsk_state == TIPC_CONNECTING || oldsk_state == TIPC_OPEN) res = 0; break; case TIPC_DISCONNECTING: if (oldsk_state == TIPC_CONNECTING || oldsk_state == TIPC_ESTABLISHED) res = 0; break; } if (!res) sk->sk_state = state; return res; } static int tipc_sk_sock_err(struct socket *sock, long *timeout) { struct sock *sk = sock->sk; int err = sock_error(sk); int typ = sock->type; if (err) return err; if (typ == SOCK_STREAM || typ == SOCK_SEQPACKET) { if (sk->sk_state == TIPC_DISCONNECTING) return -EPIPE; else if (!tipc_sk_connected(sk)) return -ENOTCONN; } if (!*timeout) return -EAGAIN; if (signal_pending(current)) return sock_intr_errno(*timeout); return 0; } #define tipc_wait_for_cond(sock_, timeo_, condition_) \ ({ \ DEFINE_WAIT_FUNC(wait_, woken_wake_function); \ struct sock *sk_; \ int rc_; \ \ while ((rc_ = !(condition_))) { \ /* coupled with smp_wmb() in tipc_sk_proto_rcv() */ \ smp_rmb(); \ sk_ = (sock_)->sk; \ rc_ = tipc_sk_sock_err((sock_), timeo_); \ if (rc_) \ break; \ prepare_to_wait(sk_sleep(sk_), &wait_, TASK_INTERRUPTIBLE); \ release_sock(sk_); \ *(timeo_) = wait_woken(&wait_, TASK_INTERRUPTIBLE, *(timeo_)); \ sched_annotate_sleep(); \ lock_sock(sk_); \ remove_wait_queue(sk_sleep(sk_), &wait_); \ } \ rc_; \ }) /** * tipc_sk_create - create a TIPC socket * @net: network namespace (must be default network) * @sock: pre-allocated socket structure * @protocol: protocol indicator (must be 0) * @kern: caused by kernel or by userspace? * * This routine creates additional data structures used by the TIPC socket, * initializes them, and links them together. * * Returns 0 on success, errno otherwise */ static int tipc_sk_create(struct net *net, struct socket *sock, int protocol, int kern) { const struct proto_ops *ops; struct sock *sk; struct tipc_sock *tsk; struct tipc_msg *msg; /* Validate arguments */ if (unlikely(protocol != 0)) return -EPROTONOSUPPORT; switch (sock->type) { case SOCK_STREAM: ops = &stream_ops; break; case SOCK_SEQPACKET: ops = &packet_ops; break; case SOCK_DGRAM: case SOCK_RDM: ops = &msg_ops; break; default: return -EPROTOTYPE; } /* Allocate socket's protocol area */ sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto, kern); if (sk == NULL) return -ENOMEM; tsk = tipc_sk(sk); tsk->max_pkt = MAX_PKT_DEFAULT; INIT_LIST_HEAD(&tsk->publications); INIT_LIST_HEAD(&tsk->cong_links); msg = &tsk->phdr; /* Finish initializing socket data structures */ sock->ops = ops; sock_init_data(sock, sk); tipc_set_sk_state(sk, TIPC_OPEN); if (tipc_sk_insert(tsk)) { sk_free(sk); pr_warn("Socket create failed; port number exhausted\n"); return -EINVAL; } /* Ensure tsk is visible before we read own_addr. */ smp_mb(); tipc_msg_init(tipc_own_addr(net), msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG, NAMED_H_SIZE, 0); msg_set_origport(msg, tsk->portid); timer_setup(&sk->sk_timer, tipc_sk_timeout, 0); sk->sk_shutdown = 0; sk->sk_backlog_rcv = tipc_sk_backlog_rcv; sk->sk_rcvbuf = sysctl_tipc_rmem[1]; sk->sk_data_ready = tipc_data_ready; sk->sk_write_space = tipc_write_space; sk->sk_destruct = tipc_sock_destruct; tsk->conn_timeout = CONN_TIMEOUT_DEFAULT; tsk->group_is_open = true; atomic_set(&tsk->dupl_rcvcnt, 0); /* Start out with safe limits until we receive an advertised window */ tsk->snd_win = tsk_adv_blocks(RCVBUF_MIN); tsk->rcv_win = tsk->snd_win; if (tipc_sk_type_connectionless(sk)) { tsk_set_unreturnable(tsk, true); if (sock->type == SOCK_DGRAM) tsk_set_unreliable(tsk, true); } return 0; } static void tipc_sk_callback(struct rcu_head *head) { struct tipc_sock *tsk = container_of(head, struct tipc_sock, rcu); sock_put(&tsk->sk); } /* Caller should hold socket lock for the socket. */ static void __tipc_shutdown(struct socket *sock, int error) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); struct net *net = sock_net(sk); long timeout = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT); u32 dnode = tsk_peer_node(tsk); struct sk_buff *skb; /* Avoid that hi-prio shutdown msgs bypass msgs in link wakeup queue */ tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))); /* Reject all unreceived messages, except on an active connection * (which disconnects locally & sends a 'FIN+' to peer). */ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (TIPC_SKB_CB(skb)->bytes_read) { kfree_skb(skb); continue; } if (!tipc_sk_type_connectionless(sk) && sk->sk_state != TIPC_DISCONNECTING) { tipc_set_sk_state(sk, TIPC_DISCONNECTING); tipc_node_remove_conn(net, dnode, tsk->portid); } tipc_sk_respond(sk, skb, error); } if (tipc_sk_type_connectionless(sk)) return; if (sk->sk_state != TIPC_DISCONNECTING) { skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, SHORT_H_SIZE, 0, dnode, tsk_own_node(tsk), tsk_peer_port(tsk), tsk->portid, error); if (skb) tipc_node_xmit_skb(net, skb, dnode, tsk->portid); tipc_node_remove_conn(net, dnode, tsk->portid); tipc_set_sk_state(sk, TIPC_DISCONNECTING); } } /** * tipc_release - destroy a TIPC socket * @sock: socket to destroy * * This routine cleans up any messages that are still queued on the socket. * For DGRAM and RDM socket types, all queued messages are rejected. * For SEQPACKET and STREAM socket types, the first message is rejected * and any others are discarded. (If the first message on a STREAM socket * is partially-read, it is discarded and the next one is rejected instead.) * * NOTE: Rejected messages are not necessarily returned to the sender! They * are returned or discarded according to the "destination droppable" setting * specified for the message by the sender. * * Returns 0 on success, errno otherwise */ static int tipc_release(struct socket *sock) { struct sock *sk = sock->sk; struct tipc_sock *tsk; /* * Exit if socket isn't fully initialized (occurs when a failed accept() * releases a pre-allocated child socket that was never used) */ if (sk == NULL) return 0; tsk = tipc_sk(sk); lock_sock(sk); __tipc_shutdown(sock, TIPC_ERR_NO_PORT); sk->sk_shutdown = SHUTDOWN_MASK; tipc_sk_leave(tsk); tipc_sk_withdraw(tsk, 0, NULL); sk_stop_timer(sk, &sk->sk_timer); tipc_sk_remove(tsk); sock_orphan(sk); /* Reject any messages that accumulated in backlog queue */ release_sock(sk); tipc_dest_list_purge(&tsk->cong_links); tsk->cong_link_cnt = 0; call_rcu(&tsk->rcu, tipc_sk_callback); sock->sk = NULL; return 0; } /** * tipc_bind - associate or disassocate TIPC name(s) with a socket * @sock: socket structure * @uaddr: socket address describing name(s) and desired operation * @uaddr_len: size of socket address data structure * * Name and name sequence binding is indicated using a positive scope value; * a negative scope value unbinds the specified name. Specifying no name * (i.e. a socket address length of 0) unbinds all names from the socket. * * Returns 0 on success, errno otherwise * * NOTE: This routine doesn't need to take the socket lock since it doesn't * access any non-constant socket information. */ static int tipc_bind(struct socket *sock, struct sockaddr *uaddr, int uaddr_len) { struct sock *sk = sock->sk; struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr; struct tipc_sock *tsk = tipc_sk(sk); int res = -EINVAL; lock_sock(sk); if (unlikely(!uaddr_len)) { res = tipc_sk_withdraw(tsk, 0, NULL); goto exit; } if (tsk->group) { res = -EACCES; goto exit; } if (uaddr_len < sizeof(struct sockaddr_tipc)) { res = -EINVAL; goto exit; } if (addr->family != AF_TIPC) { res = -EAFNOSUPPORT; goto exit; } if (addr->addrtype == TIPC_ADDR_NAME) addr->addr.nameseq.upper = addr->addr.nameseq.lower; else if (addr->addrtype != TIPC_ADDR_NAMESEQ) { res = -EAFNOSUPPORT; goto exit; } if ((addr->addr.nameseq.type < TIPC_RESERVED_TYPES) && (addr->addr.nameseq.type != TIPC_TOP_SRV) && (addr->addr.nameseq.type != TIPC_CFG_SRV)) { res = -EACCES; goto exit; } res = (addr->scope >= 0) ? tipc_sk_publish(tsk, addr->scope, &addr->addr.nameseq) : tipc_sk_withdraw(tsk, -addr->scope, &addr->addr.nameseq); exit: release_sock(sk); return res; } /** * tipc_getname - get port ID of socket or peer socket * @sock: socket structure * @uaddr: area for returned socket address * @uaddr_len: area for returned length of socket address * @peer: 0 = own ID, 1 = current peer ID, 2 = current/former peer ID * * Returns 0 on success, errno otherwise * * NOTE: This routine doesn't need to take the socket lock since it only * accesses socket information that is unchanging (or which changes in * a completely predictable manner). */ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr; struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); memset(addr, 0, sizeof(*addr)); if (peer) { if ((!tipc_sk_connected(sk)) && ((peer != 2) || (sk->sk_state != TIPC_DISCONNECTING))) return -ENOTCONN; addr->addr.id.ref = tsk_peer_port(tsk); addr->addr.id.node = tsk_peer_node(tsk); } else { addr->addr.id.ref = tsk->portid; addr->addr.id.node = tipc_own_addr(sock_net(sk)); } addr->addrtype = TIPC_ADDR_ID; addr->family = AF_TIPC; addr->scope = 0; addr->addr.name.domain = 0; return sizeof(*addr); } /** * tipc_poll - read and possibly block on pollmask * @file: file structure associated with the socket * @sock: socket for which to calculate the poll bits * @wait: ??? * * Returns pollmask value * * COMMENTARY: * It appears that the usual socket locking mechanisms are not useful here * since the pollmask info is potentially out-of-date the moment this routine * exits. TCP and other protocols seem to rely on higher level poll routines * to handle any preventable race conditions, so TIPC will do the same ... * * IMPORTANT: The fact that a read or write operation is indicated does NOT * imply that the operation will succeed, merely that it should be performed * and will not block. */ static __poll_t tipc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); __poll_t revents = 0; sock_poll_wait(file, sock, wait); if (sk->sk_shutdown & RCV_SHUTDOWN) revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) revents |= EPOLLHUP; switch (sk->sk_state) { case TIPC_ESTABLISHED: if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk)) revents |= EPOLLOUT; /* fall thru' */ case TIPC_LISTEN: case TIPC_CONNECTING: if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) revents |= EPOLLIN | EPOLLRDNORM; break; case TIPC_OPEN: if (tsk->group_is_open && !tsk->cong_link_cnt) revents |= EPOLLOUT; if (!tipc_sk_type_connectionless(sk)) break; if (skb_queue_empty_lockless(&sk->sk_receive_queue)) break; revents |= EPOLLIN | EPOLLRDNORM; break; case TIPC_DISCONNECTING: revents = EPOLLIN | EPOLLRDNORM | EPOLLHUP; break; } return revents; } /** * tipc_sendmcast - send multicast message * @sock: socket structure * @seq: destination address * @msg: message to send * @dlen: length of data to send * @timeout: timeout to wait for wakeup * * Called from function tipc_sendmsg(), which has done all sanity checks * Returns the number of bytes sent on success, or errno */ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, struct msghdr *msg, size_t dlen, long timeout) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = &tsk->phdr; struct net *net = sock_net(sk); int mtu = tipc_bcast_get_mtu(net); struct tipc_mc_method *method = &tsk->mc_method; struct sk_buff_head pkts; struct tipc_nlist dsts; int rc; if (tsk->group) return -EACCES; /* Block or return if any destination link is congested */ rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt); if (unlikely(rc)) return rc; /* Lookup destination nodes */ tipc_nlist_init(&dsts, tipc_own_addr(net)); tipc_nametbl_lookup_dst_nodes(net, seq->type, seq->lower, seq->upper, &dsts); if (!dsts.local && !dsts.remote) return -EHOSTUNREACH; /* Build message header */ msg_set_type(hdr, TIPC_MCAST_MSG); msg_set_hdr_sz(hdr, MCAST_H_SIZE); msg_set_lookup_scope(hdr, TIPC_CLUSTER_SCOPE); msg_set_destport(hdr, 0); msg_set_destnode(hdr, 0); msg_set_nametype(hdr, seq->type); msg_set_namelower(hdr, seq->lower); msg_set_nameupper(hdr, seq->upper); /* Build message as chain of buffers */ __skb_queue_head_init(&pkts); rc = tipc_msg_build(hdr, msg, 0, dlen, mtu, &pkts); /* Send message if build was successful */ if (unlikely(rc == dlen)) rc = tipc_mcast_xmit(net, &pkts, method, &dsts, &tsk->cong_link_cnt); tipc_nlist_purge(&dsts); return rc ? rc : dlen; } /** * tipc_send_group_msg - send a message to a member in the group * @net: network namespace * @m: message to send * @mb: group member * @dnode: destination node * @dport: destination port * @dlen: total length of message data */ static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, struct msghdr *m, struct tipc_member *mb, u32 dnode, u32 dport, int dlen) { u16 bc_snd_nxt = tipc_group_bc_snd_nxt(tsk->group); struct tipc_mc_method *method = &tsk->mc_method; int blks = tsk_blocks(GROUP_H_SIZE + dlen); struct tipc_msg *hdr = &tsk->phdr; struct sk_buff_head pkts; int mtu, rc; /* Complete message header */ msg_set_type(hdr, TIPC_GRP_UCAST_MSG); msg_set_hdr_sz(hdr, GROUP_H_SIZE); msg_set_destport(hdr, dport); msg_set_destnode(hdr, dnode); msg_set_grp_bc_seqno(hdr, bc_snd_nxt); /* Build message as chain of buffers */ __skb_queue_head_init(&pkts); mtu = tipc_node_get_mtu(net, dnode, tsk->portid); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; /* Send message */ rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); if (unlikely(rc == -ELINKCONG)) { tipc_dest_push(&tsk->cong_links, dnode, 0); tsk->cong_link_cnt++; } /* Update send window */ tipc_group_update_member(mb, blks); /* A broadcast sent within next EXPIRE period must follow same path */ method->rcast = true; method->mandatory = true; return dlen; } /** * tipc_send_group_unicast - send message to a member in the group * @sock: socket structure * @m: message to send * @dlen: total length of message data * @timeout: timeout to wait for wakeup * * Called from function tipc_sendmsg(), which has done all sanity checks * Returns the number of bytes sent on success, or errno */ static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m, int dlen, long timeout) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); int blks = tsk_blocks(GROUP_H_SIZE + dlen); struct tipc_sock *tsk = tipc_sk(sk); struct net *net = sock_net(sk); struct tipc_member *mb = NULL; u32 node, port; int rc; node = dest->addr.id.node; port = dest->addr.id.ref; if (!port && !node) return -EHOSTUNREACH; /* Block or return if destination link or member is congested */ rc = tipc_wait_for_cond(sock, &timeout, !tipc_dest_find(&tsk->cong_links, node, 0) && tsk->group && !tipc_group_cong(tsk->group, node, port, blks, &mb)); if (unlikely(rc)) return rc; if (unlikely(!mb)) return -EHOSTUNREACH; rc = tipc_send_group_msg(net, tsk, m, mb, node, port, dlen); return rc ? rc : dlen; } /** * tipc_send_group_anycast - send message to any member with given identity * @sock: socket structure * @m: message to send * @dlen: total length of message data * @timeout: timeout to wait for wakeup * * Called from function tipc_sendmsg(), which has done all sanity checks * Returns the number of bytes sent on success, or errno */ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, int dlen, long timeout) { DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); struct list_head *cong_links = &tsk->cong_links; int blks = tsk_blocks(GROUP_H_SIZE + dlen); struct tipc_msg *hdr = &tsk->phdr; struct tipc_member *first = NULL; struct tipc_member *mbr = NULL; struct net *net = sock_net(sk); u32 node, port, exclude; struct list_head dsts; u32 type, inst, scope; int lookups = 0; int dstcnt, rc; bool cong; INIT_LIST_HEAD(&dsts); type = msg_nametype(hdr); inst = dest->addr.name.name.instance; scope = msg_lookup_scope(hdr); while (++lookups < 4) { exclude = tipc_group_exclude(tsk->group); first = NULL; /* Look for a non-congested destination member, if any */ while (1) { if (!tipc_nametbl_lookup(net, type, inst, scope, &dsts, &dstcnt, exclude, false)) return -EHOSTUNREACH; tipc_dest_pop(&dsts, &node, &port); cong = tipc_group_cong(tsk->group, node, port, blks, &mbr); if (!cong) break; if (mbr == first) break; if (!first) first = mbr; } /* Start over if destination was not in member list */ if (unlikely(!mbr)) continue; if (likely(!cong && !tipc_dest_find(cong_links, node, 0))) break; /* Block or return if destination link or member is congested */ rc = tipc_wait_for_cond(sock, &timeout, !tipc_dest_find(cong_links, node, 0) && tsk->group && !tipc_group_cong(tsk->group, node, port, blks, &mbr)); if (unlikely(rc)) return rc; /* Send, unless destination disappeared while waiting */ if (likely(mbr)) break; } if (unlikely(lookups >= 4)) return -EHOSTUNREACH; rc = tipc_send_group_msg(net, tsk, m, mbr, node, port, dlen); return rc ? rc : dlen; } /** * tipc_send_group_bcast - send message to all members in communication group * @sk: socket structure * @m: message to send * @dlen: total length of message data * @timeout: timeout to wait for wakeup * * Called from function tipc_sendmsg(), which has done all sanity checks * Returns the number of bytes sent on success, or errno */ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, int dlen, long timeout) { DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct tipc_sock *tsk = tipc_sk(sk); struct tipc_nlist *dsts; struct tipc_mc_method *method = &tsk->mc_method; bool ack = method->mandatory && method->rcast; int blks = tsk_blocks(MCAST_H_SIZE + dlen); struct tipc_msg *hdr = &tsk->phdr; int mtu = tipc_bcast_get_mtu(net); struct sk_buff_head pkts; int rc = -EHOSTUNREACH; /* Block or return if any destination link or member is congested */ rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt && tsk->group && !tipc_group_bc_cong(tsk->group, blks)); if (unlikely(rc)) return rc; dsts = tipc_group_dests(tsk->group); if (!dsts->local && !dsts->remote) return -EHOSTUNREACH; /* Complete message header */ if (dest) { msg_set_type(hdr, TIPC_GRP_MCAST_MSG); msg_set_nameinst(hdr, dest->addr.name.name.instance); } else { msg_set_type(hdr, TIPC_GRP_BCAST_MSG); msg_set_nameinst(hdr, 0); } msg_set_hdr_sz(hdr, GROUP_H_SIZE); msg_set_destport(hdr, 0); msg_set_destnode(hdr, 0); msg_set_grp_bc_seqno(hdr, tipc_group_bc_snd_nxt(tsk->group)); /* Avoid getting stuck with repeated forced replicasts */ msg_set_grp_bc_ack_req(hdr, ack); /* Build message as chain of buffers */ __skb_queue_head_init(&pkts); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; /* Send message */ rc = tipc_mcast_xmit(net, &pkts, method, dsts, &tsk->cong_link_cnt); if (unlikely(rc)) return rc; /* Update broadcast sequence number and send windows */ tipc_group_update_bc_members(tsk->group, blks, ack); /* Broadcast link is now free to choose method for next broadcast */ method->mandatory = false; method->expires = jiffies; return dlen; } /** * tipc_send_group_mcast - send message to all members with given identity * @sock: socket structure * @m: message to send * @dlen: total length of message data * @timeout: timeout to wait for wakeup * * Called from function tipc_sendmsg(), which has done all sanity checks * Returns the number of bytes sent on success, or errno */ static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m, int dlen, long timeout) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); struct tipc_sock *tsk = tipc_sk(sk); struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = &tsk->phdr; struct net *net = sock_net(sk); u32 type, inst, scope, exclude; struct list_head dsts; u32 dstcnt; INIT_LIST_HEAD(&dsts); type = msg_nametype(hdr); inst = dest->addr.name.name.instance; scope = msg_lookup_scope(hdr); exclude = tipc_group_exclude(grp); if (!tipc_nametbl_lookup(net, type, inst, scope, &dsts, &dstcnt, exclude, true)) return -EHOSTUNREACH; if (dstcnt == 1) { tipc_dest_pop(&dsts, &dest->addr.id.node, &dest->addr.id.ref); return tipc_send_group_unicast(sock, m, dlen, timeout); } tipc_dest_list_purge(&dsts); return tipc_send_group_bcast(sock, m, dlen, timeout); } /** * tipc_sk_mcast_rcv - Deliver multicast messages to all destination sockets * @arrvq: queue with arriving messages, to be cloned after destination lookup * @inputq: queue with cloned messages, delivered to socket after dest lookup * * Multi-threaded: parallel calls with reference to same queues may occur */ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct sk_buff_head *inputq) { u32 self = tipc_own_addr(net); u32 type, lower, upper, scope; struct sk_buff *skb, *_skb; u32 portid, onode; struct sk_buff_head tmpq; struct list_head dports; struct tipc_msg *hdr; int user, mtyp, hlen; bool exact; __skb_queue_head_init(&tmpq); INIT_LIST_HEAD(&dports); skb = tipc_skb_peek(arrvq, &inputq->lock); for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) { hdr = buf_msg(skb); user = msg_user(hdr); mtyp = msg_type(hdr); hlen = skb_headroom(skb) + msg_hdr_sz(hdr); onode = msg_orignode(hdr); type = msg_nametype(hdr); if (mtyp == TIPC_GRP_UCAST_MSG || user == GROUP_PROTOCOL) { spin_lock_bh(&inputq->lock); if (skb_peek(arrvq) == skb) { __skb_dequeue(arrvq); __skb_queue_tail(inputq, skb); } kfree_skb(skb); spin_unlock_bh(&inputq->lock); continue; } /* Group messages require exact scope match */ if (msg_in_group(hdr)) { lower = 0; upper = ~0; scope = msg_lookup_scope(hdr); exact = true; } else { /* TIPC_NODE_SCOPE means "any scope" in this context */ if (onode == self) scope = TIPC_NODE_SCOPE; else scope = TIPC_CLUSTER_SCOPE; exact = false; lower = msg_namelower(hdr); upper = msg_nameupper(hdr); } /* Create destination port list: */ tipc_nametbl_mc_lookup(net, type, lower, upper, scope, exact, &dports); /* Clone message per destination */ while (tipc_dest_pop(&dports, NULL, &portid)) { _skb = __pskb_copy(skb, hlen, GFP_ATOMIC); if (_skb) { msg_set_destport(buf_msg(_skb), portid); __skb_queue_tail(&tmpq, _skb); continue; } pr_warn("Failed to clone mcast rcv buffer\n"); } /* Append to inputq if not already done by other thread */ spin_lock_bh(&inputq->lock); if (skb_peek(arrvq) == skb) { skb_queue_splice_tail_init(&tmpq, inputq); /* Decrease the skb's refcnt as increasing in the * function tipc_skb_peek */ kfree_skb(__skb_dequeue(arrvq)); } spin_unlock_bh(&inputq->lock); __skb_queue_purge(&tmpq); kfree_skb(skb); } tipc_sk_rcv(net, inputq); } /** * tipc_sk_conn_proto_rcv - receive a connection mng protocol message * @tsk: receiving socket * @skb: pointer to message buffer. */ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct tipc_msg *hdr = buf_msg(skb); u32 onode = tsk_own_node(tsk); struct sock *sk = &tsk->sk; int mtyp = msg_type(hdr); bool conn_cong; /* Ignore if connection cannot be validated: */ if (!tsk_peer_msg(tsk, hdr)) goto exit; if (unlikely(msg_errcode(hdr))) { tipc_set_sk_state(sk, TIPC_DISCONNECTING); tipc_node_remove_conn(sock_net(sk), tsk_peer_node(tsk), tsk_peer_port(tsk)); sk->sk_state_change(sk); /* State change is ignored if socket already awake, * - convert msg to abort msg and add to inqueue */ msg_set_user(hdr, TIPC_CRITICAL_IMPORTANCE); msg_set_type(hdr, TIPC_CONN_MSG); msg_set_size(hdr, BASIC_H_SIZE); msg_set_hdr_sz(hdr, BASIC_H_SIZE); __skb_queue_tail(inputq, skb); return; } tsk->probe_unacked = false; if (mtyp == CONN_PROBE) { msg_set_type(hdr, CONN_PROBE_REPLY); if (tipc_msg_reverse(onode, &skb, TIPC_OK)) __skb_queue_tail(xmitq, skb); return; } else if (mtyp == CONN_ACK) { conn_cong = tsk_conn_cong(tsk); tsk->snt_unacked -= msg_conn_ack(hdr); if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) tsk->snd_win = msg_adv_win(hdr); if (conn_cong) sk->sk_write_space(sk); } else if (mtyp != CONN_PROBE_REPLY) { pr_warn("Received unknown CONN_PROTO msg\n"); } exit: kfree_skb(skb); } /** * tipc_sendmsg - send message in connectionless manner * @sock: socket structure * @m: message to send * @dsz: amount of user data to be sent * * Message must have an destination specified explicitly. * Used for SOCK_RDM and SOCK_DGRAM messages, * and for 'SYN' messages on SOCK_SEQPACKET and SOCK_STREAM connections. * (Note: 'SYN+' is prohibited on SOCK_STREAM.) * * Returns the number of bytes sent on success, or errno otherwise */ static int tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz) { struct sock *sk = sock->sk; int ret; lock_sock(sk); ret = __tipc_sendmsg(sock, m, dsz); release_sock(sk); return ret; } static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct tipc_sock *tsk = tipc_sk(sk); DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); struct list_head *clinks = &tsk->cong_links; bool syn = !tipc_sk_type_connectionless(sk); struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = &tsk->phdr; struct tipc_name_seq *seq; struct sk_buff_head pkts; u32 dport, dnode = 0; u32 type, inst; int mtu, rc; if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) return -EMSGSIZE; if (likely(dest)) { if (unlikely(m->msg_namelen < sizeof(*dest))) return -EINVAL; if (unlikely(dest->family != AF_TIPC)) return -EINVAL; } if (grp) { if (!dest) return tipc_send_group_bcast(sock, m, dlen, timeout); if (dest->addrtype == TIPC_ADDR_NAME) return tipc_send_group_anycast(sock, m, dlen, timeout); if (dest->addrtype == TIPC_ADDR_ID) return tipc_send_group_unicast(sock, m, dlen, timeout); if (dest->addrtype == TIPC_ADDR_MCAST) return tipc_send_group_mcast(sock, m, dlen, timeout); return -EINVAL; } if (unlikely(!dest)) { dest = &tsk->peer; if (!syn && dest->family != AF_TIPC) return -EDESTADDRREQ; } if (unlikely(syn)) { if (sk->sk_state == TIPC_LISTEN) return -EPIPE; if (sk->sk_state != TIPC_OPEN) return -EISCONN; if (tsk->published) return -EOPNOTSUPP; if (dest->addrtype == TIPC_ADDR_NAME) { tsk->conn_type = dest->addr.name.name.type; tsk->conn_instance = dest->addr.name.name.instance; } } seq = &dest->addr.nameseq; if (dest->addrtype == TIPC_ADDR_MCAST) return tipc_sendmcast(sock, seq, m, dlen, timeout); if (dest->addrtype == TIPC_ADDR_NAME) { type = dest->addr.name.name.type; inst = dest->addr.name.name.instance; dnode = dest->addr.name.domain; msg_set_type(hdr, TIPC_NAMED_MSG); msg_set_hdr_sz(hdr, NAMED_H_SIZE); msg_set_nametype(hdr, type); msg_set_nameinst(hdr, inst); msg_set_lookup_scope(hdr, tipc_node2scope(dnode)); dport = tipc_nametbl_translate(net, type, inst, &dnode); msg_set_destnode(hdr, dnode); msg_set_destport(hdr, dport); if (unlikely(!dport && !dnode)) return -EHOSTUNREACH; } else if (dest->addrtype == TIPC_ADDR_ID) { dnode = dest->addr.id.node; msg_set_type(hdr, TIPC_DIRECT_MSG); msg_set_lookup_scope(hdr, 0); msg_set_destnode(hdr, dnode); msg_set_destport(hdr, dest->addr.id.ref); msg_set_hdr_sz(hdr, BASIC_H_SIZE); } else { return -EINVAL; } /* Block or return if destination link is congested */ rc = tipc_wait_for_cond(sock, &timeout, !tipc_dest_find(clinks, dnode, 0)); if (unlikely(rc)) return rc; __skb_queue_head_init(&pkts); mtu = tipc_node_get_mtu(net, dnode, tsk->portid); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); if (unlikely(rc == -ELINKCONG)) { tipc_dest_push(clinks, dnode, 0); tsk->cong_link_cnt++; rc = 0; } if (unlikely(syn && !rc)) tipc_set_sk_state(sk, TIPC_CONNECTING); return rc ? rc : dlen; } /** * tipc_sendstream - send stream-oriented data * @sock: socket structure * @m: data to send * @dsz: total length of data to be transmitted * * Used for SOCK_STREAM data. * * Returns the number of bytes sent on success (or partial success), * or errno if no data sent */ static int tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz) { struct sock *sk = sock->sk; int ret; lock_sock(sk); ret = __tipc_sendstream(sock, m, dsz); release_sock(sk); return ret; } static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = &tsk->phdr; struct net *net = sock_net(sk); struct sk_buff_head pkts; u32 dnode = tsk_peer_node(tsk); int send, sent = 0; int rc = 0; __skb_queue_head_init(&pkts); if (unlikely(dlen > INT_MAX)) return -EMSGSIZE; /* Handle implicit connection setup */ if (unlikely(dest)) { rc = __tipc_sendmsg(sock, m, dlen); if (dlen && dlen == rc) { tsk->peer_caps = tipc_node_get_capabilities(net, dnode); tsk->snt_unacked = tsk_inc(tsk, dlen + msg_hdr_sz(hdr)); } return rc; } do { rc = tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt && !tsk_conn_cong(tsk) && tipc_sk_connected(sk))); if (unlikely(rc)) break; send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE); rc = tipc_msg_build(hdr, m, sent, send, tsk->max_pkt, &pkts); if (unlikely(rc != send)) break; rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); if (unlikely(rc == -ELINKCONG)) { tsk->cong_link_cnt = 1; rc = 0; } if (likely(!rc)) { tsk->snt_unacked += tsk_inc(tsk, send + MIN_H_SIZE); sent += send; } } while (sent < dlen && !rc); return sent ? sent : rc; } /** * tipc_send_packet - send a connection-oriented message * @sock: socket structure * @m: message to send * @dsz: length of data to be transmitted * * Used for SOCK_SEQPACKET messages. * * Returns the number of bytes sent on success, or errno otherwise */ static int tipc_send_packet(struct socket *sock, struct msghdr *m, size_t dsz) { if (dsz > TIPC_MAX_USER_MSG_SIZE) return -EMSGSIZE; return tipc_sendstream(sock, m, dsz); } /* tipc_sk_finish_conn - complete the setup of a connection */ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, u32 peer_node) { struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); struct tipc_msg *msg = &tsk->phdr; msg_set_destnode(msg, peer_node); msg_set_destport(msg, peer_port); msg_set_type(msg, TIPC_CONN_MSG); msg_set_lookup_scope(msg, 0); msg_set_hdr_sz(msg, SHORT_H_SIZE); sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV); tipc_set_sk_state(sk, TIPC_ESTABLISHED); tipc_node_add_conn(net, peer_node, tsk->portid, peer_port); tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid); tsk->peer_caps = tipc_node_get_capabilities(net, peer_node); if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) return; /* Fall back to message based flow control */ tsk->rcv_win = FLOWCTL_MSG_WIN; tsk->snd_win = FLOWCTL_MSG_WIN; } /** * tipc_sk_set_orig_addr - capture sender's address for received message * @m: descriptor for message info * @hdr: received message header * * Note: Address is not captured if not requested by receiver. */ static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb) { DECLARE_SOCKADDR(struct sockaddr_pair *, srcaddr, m->msg_name); struct tipc_msg *hdr = buf_msg(skb); if (!srcaddr) return; srcaddr->sock.family = AF_TIPC; srcaddr->sock.addrtype = TIPC_ADDR_ID; srcaddr->sock.scope = 0; srcaddr->sock.addr.id.ref = msg_origport(hdr); srcaddr->sock.addr.id.node = msg_orignode(hdr); srcaddr->sock.addr.name.domain = 0; m->msg_namelen = sizeof(struct sockaddr_tipc); if (!msg_in_group(hdr)) return; /* Group message users may also want to know sending member's id */ srcaddr->member.family = AF_TIPC; srcaddr->member.addrtype = TIPC_ADDR_NAME; srcaddr->member.scope = 0; srcaddr->member.addr.name.name.type = msg_nametype(hdr); srcaddr->member.addr.name.name.instance = TIPC_SKB_CB(skb)->orig_member; srcaddr->member.addr.name.domain = 0; m->msg_namelen = sizeof(*srcaddr); } /** * tipc_sk_anc_data_recv - optionally capture ancillary data for received message * @m: descriptor for message info * @skb: received message buffer * @tsk: TIPC port associated with message * * Note: Ancillary data is not captured if not requested by receiver. * * Returns 0 if successful, otherwise errno */ static int tipc_sk_anc_data_recv(struct msghdr *m, struct sk_buff *skb, struct tipc_sock *tsk) { struct tipc_msg *msg; u32 anc_data[3]; u32 err; u32 dest_type; int has_name; int res; if (likely(m->msg_controllen == 0)) return 0; msg = buf_msg(skb); /* Optionally capture errored message object(s) */ err = msg ? msg_errcode(msg) : 0; if (unlikely(err)) { anc_data[0] = err; anc_data[1] = msg_data_sz(msg); res = put_cmsg(m, SOL_TIPC, TIPC_ERRINFO, 8, anc_data); if (res) return res; if (anc_data[1]) { if (skb_linearize(skb)) return -ENOMEM; msg = buf_msg(skb); res = put_cmsg(m, SOL_TIPC, TIPC_RETDATA, anc_data[1], msg_data(msg)); if (res) return res; } } /* Optionally capture message destination object */ dest_type = msg ? msg_type(msg) : TIPC_DIRECT_MSG; switch (dest_type) { case TIPC_NAMED_MSG: has_name = 1; anc_data[0] = msg_nametype(msg); anc_data[1] = msg_namelower(msg); anc_data[2] = msg_namelower(msg); break; case TIPC_MCAST_MSG: has_name = 1; anc_data[0] = msg_nametype(msg); anc_data[1] = msg_namelower(msg); anc_data[2] = msg_nameupper(msg); break; case TIPC_CONN_MSG: has_name = (tsk->conn_type != 0); anc_data[0] = tsk->conn_type; anc_data[1] = tsk->conn_instance; anc_data[2] = tsk->conn_instance; break; default: has_name = 0; } if (has_name) { res = put_cmsg(m, SOL_TIPC, TIPC_DESTNAME, 12, anc_data); if (res) return res; } return 0; } static void tipc_sk_send_ack(struct tipc_sock *tsk) { struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); struct sk_buff *skb = NULL; struct tipc_msg *msg; u32 peer_port = tsk_peer_port(tsk); u32 dnode = tsk_peer_node(tsk); if (!tipc_sk_connected(sk)) return; skb = tipc_msg_create(CONN_MANAGER, CONN_ACK, INT_H_SIZE, 0, dnode, tsk_own_node(tsk), peer_port, tsk->portid, TIPC_OK); if (!skb) return; msg = buf_msg(skb); msg_set_conn_ack(msg, tsk->rcv_unacked); tsk->rcv_unacked = 0; /* Adjust to and advertize the correct window limit */ if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) { tsk->rcv_win = tsk_adv_blocks(tsk->sk.sk_rcvbuf); msg_set_adv_win(msg, tsk->rcv_win); } tipc_node_xmit_skb(net, skb, dnode, msg_link_selector(msg)); } static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) { struct sock *sk = sock->sk; DEFINE_WAIT(wait); long timeo = *timeop; int err = sock_error(sk); if (err) return err; for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (timeo && skb_queue_empty(&sk->sk_receive_queue)) { if (sk->sk_shutdown & RCV_SHUTDOWN) { err = -ENOTCONN; break; } release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); } err = 0; if (!skb_queue_empty(&sk->sk_receive_queue)) break; err = -EAGAIN; if (!timeo) break; err = sock_intr_errno(timeo); if (signal_pending(current)) break; err = sock_error(sk); if (err) break; } finish_wait(sk_sleep(sk), &wait); *timeop = timeo; return err; } /** * tipc_recvmsg - receive packet-oriented message * @m: descriptor for message info * @buflen: length of user buffer area * @flags: receive flags * * Used for SOCK_DGRAM, SOCK_RDM, and SOCK_SEQPACKET messages. * If the complete message doesn't fit in user area, truncate it. * * Returns size of returned message data, errno otherwise */ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buflen, int flags) { struct sock *sk = sock->sk; bool connected = !tipc_sk_type_connectionless(sk); struct tipc_sock *tsk = tipc_sk(sk); int rc, err, hlen, dlen, copy; struct tipc_skb_cb *skb_cb; struct sk_buff_head xmitq; struct tipc_msg *hdr; struct sk_buff *skb; bool grp_evt; long timeout; /* Catch invalid receive requests */ if (unlikely(!buflen)) return -EINVAL; lock_sock(sk); if (unlikely(connected && sk->sk_state == TIPC_OPEN)) { rc = -ENOTCONN; goto exit; } timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); /* Step rcv queue to first msg with data or error; wait if necessary */ do { rc = tipc_wait_for_rcvmsg(sock, &timeout); if (unlikely(rc)) goto exit; skb = skb_peek(&sk->sk_receive_queue); skb_cb = TIPC_SKB_CB(skb); hdr = buf_msg(skb); dlen = msg_data_sz(hdr); hlen = msg_hdr_sz(hdr); err = msg_errcode(hdr); grp_evt = msg_is_grp_evt(hdr); if (likely(dlen || err)) break; tsk_advance_rx_queue(sk); } while (1); /* Collect msg meta data, including error code and rejected data */ tipc_sk_set_orig_addr(m, skb); rc = tipc_sk_anc_data_recv(m, skb, tsk); if (unlikely(rc)) goto exit; hdr = buf_msg(skb); /* Capture data if non-error msg, otherwise just set return value */ if (likely(!err)) { int offset = skb_cb->bytes_read; copy = min_t(int, dlen - offset, buflen); rc = skb_copy_datagram_msg(skb, hlen + offset, m, copy); if (unlikely(rc)) goto exit; if (unlikely(offset + copy < dlen)) { if (flags & MSG_EOR) { if (!(flags & MSG_PEEK)) skb_cb->bytes_read = offset + copy; } else { m->msg_flags |= MSG_TRUNC; skb_cb->bytes_read = 0; } } else { if (flags & MSG_EOR) m->msg_flags |= MSG_EOR; skb_cb->bytes_read = 0; } } else { copy = 0; rc = 0; if (err != TIPC_CONN_SHUTDOWN && connected && !m->msg_control) { rc = -ECONNRESET; goto exit; } } /* Mark message as group event if applicable */ if (unlikely(grp_evt)) { if (msg_grp_evt(hdr) == TIPC_WITHDRAWN) m->msg_flags |= MSG_EOR; m->msg_flags |= MSG_OOB; copy = 0; } /* Caption of data or error code/rejected data was successful */ if (unlikely(flags & MSG_PEEK)) goto exit; /* Send group flow control advertisement when applicable */ if (tsk->group && msg_in_group(hdr) && !grp_evt) { __skb_queue_head_init(&xmitq); tipc_group_update_rcv_win(tsk->group, tsk_blocks(hlen + dlen), msg_orignode(hdr), msg_origport(hdr), &xmitq); tipc_node_distr_xmit(sock_net(sk), &xmitq); } if (skb_cb->bytes_read) goto exit; tsk_advance_rx_queue(sk); if (likely(!connected)) goto exit; /* Send connection flow control advertisement when applicable */ tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen); if (tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE) tipc_sk_send_ack(tsk); exit: release_sock(sk); return rc ? rc : copy; } /** * tipc_recvstream - receive stream-oriented data * @m: descriptor for message info * @buflen: total size of user buffer area * @flags: receive flags * * Used for SOCK_STREAM messages only. If not enough data is available * will optionally wait for more; never truncates data. * * Returns size of returned message data, errno otherwise */ static int tipc_recvstream(struct socket *sock, struct msghdr *m, size_t buflen, int flags) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); struct sk_buff *skb; struct tipc_msg *hdr; struct tipc_skb_cb *skb_cb; bool peek = flags & MSG_PEEK; int offset, required, copy, copied = 0; int hlen, dlen, err, rc; long timeout; /* Catch invalid receive attempts */ if (unlikely(!buflen)) return -EINVAL; lock_sock(sk); if (unlikely(sk->sk_state == TIPC_OPEN)) { rc = -ENOTCONN; goto exit; } required = sock_rcvlowat(sk, flags & MSG_WAITALL, buflen); timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { /* Look at first msg in receive queue; wait if necessary */ rc = tipc_wait_for_rcvmsg(sock, &timeout); if (unlikely(rc)) break; skb = skb_peek(&sk->sk_receive_queue); skb_cb = TIPC_SKB_CB(skb); hdr = buf_msg(skb); dlen = msg_data_sz(hdr); hlen = msg_hdr_sz(hdr); err = msg_errcode(hdr); /* Discard any empty non-errored (SYN-) message */ if (unlikely(!dlen && !err)) { tsk_advance_rx_queue(sk); continue; } /* Collect msg meta data, incl. error code and rejected data */ if (!copied) { tipc_sk_set_orig_addr(m, skb); rc = tipc_sk_anc_data_recv(m, skb, tsk); if (rc) break; hdr = buf_msg(skb); } /* Copy data if msg ok, otherwise return error/partial data */ if (likely(!err)) { offset = skb_cb->bytes_read; copy = min_t(int, dlen - offset, buflen - copied); rc = skb_copy_datagram_msg(skb, hlen + offset, m, copy); if (unlikely(rc)) break; copied += copy; offset += copy; if (unlikely(offset < dlen)) { if (!peek) skb_cb->bytes_read = offset; break; } } else { rc = 0; if ((err != TIPC_CONN_SHUTDOWN) && !m->msg_control) rc = -ECONNRESET; if (copied || rc) break; } if (unlikely(peek)) break; tsk_advance_rx_queue(sk); /* Send connection flow control advertisement when applicable */ tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen); if (unlikely(tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE)) tipc_sk_send_ack(tsk); /* Exit if all requested data or FIN/error received */ if (copied == buflen || err) break; } while (!skb_queue_empty(&sk->sk_receive_queue) || copied < required); exit: release_sock(sk); return copied ? copied : rc; } /** * tipc_write_space - wake up thread if port congestion is released * @sk: socket */ static void tipc_write_space(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); rcu_read_unlock(); } /** * tipc_data_ready - wake up threads to indicate messages have been received * @sk: socket * @len: the length of messages */ static void tipc_data_ready(struct sock *sk) { struct socket_wq *wq; rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLRDNORM | EPOLLRDBAND); rcu_read_unlock(); } static void tipc_sock_destruct(struct sock *sk) { __skb_queue_purge(&sk->sk_receive_queue); } static void tipc_sk_proto_rcv(struct sock *sk, struct sk_buff_head *inputq, struct sk_buff_head *xmitq) { struct sk_buff *skb = __skb_dequeue(inputq); struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = buf_msg(skb); struct tipc_group *grp = tsk->group; bool wakeup = false; switch (msg_user(hdr)) { case CONN_MANAGER: tipc_sk_conn_proto_rcv(tsk, skb, inputq, xmitq); return; case SOCK_WAKEUP: tipc_dest_del(&tsk->cong_links, msg_orignode(hdr), 0); /* coupled with smp_rmb() in tipc_wait_for_cond() */ smp_wmb(); tsk->cong_link_cnt--; wakeup = true; break; case GROUP_PROTOCOL: tipc_group_proto_rcv(grp, &wakeup, hdr, inputq, xmitq); break; case TOP_SRV: tipc_group_member_evt(tsk->group, &wakeup, &sk->sk_rcvbuf, hdr, inputq, xmitq); break; default: break; } if (wakeup) sk->sk_write_space(sk); kfree_skb(skb); } /** * tipc_filter_connect - Handle incoming message for a connection-based socket * @tsk: TIPC socket * @skb: pointer to message buffer. Set to NULL if buffer is consumed * * Returns true if everything ok, false otherwise */ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) { struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); struct tipc_msg *hdr = buf_msg(skb); u32 pport = msg_origport(hdr); u32 pnode = msg_orignode(hdr); if (unlikely(msg_mcast(hdr))) return false; switch (sk->sk_state) { case TIPC_CONNECTING: /* Accept only ACK or NACK message */ if (unlikely(!msg_connected(hdr))) { if (pport != tsk_peer_port(tsk) || pnode != tsk_peer_node(tsk)) return false; tipc_set_sk_state(sk, TIPC_DISCONNECTING); sk->sk_err = ECONNREFUSED; sk->sk_state_change(sk); return true; } if (unlikely(msg_errcode(hdr))) { tipc_set_sk_state(sk, TIPC_DISCONNECTING); sk->sk_err = ECONNREFUSED; sk->sk_state_change(sk); return true; } if (unlikely(!msg_isdata(hdr))) { tipc_set_sk_state(sk, TIPC_DISCONNECTING); sk->sk_err = EINVAL; sk->sk_state_change(sk); return true; } tipc_sk_finish_conn(tsk, msg_origport(hdr), msg_orignode(hdr)); msg_set_importance(&tsk->phdr, msg_importance(hdr)); /* If 'ACK+' message, add to socket receive queue */ if (msg_data_sz(hdr)) return true; /* If empty 'ACK-' message, wake up sleeping connect() */ sk->sk_state_change(sk); /* 'ACK-' message is neither accepted nor rejected: */ msg_set_dest_droppable(hdr, 1); return false; case TIPC_OPEN: case TIPC_DISCONNECTING: break; case TIPC_LISTEN: /* Accept only SYN message */ if (!msg_connected(hdr) && !(msg_errcode(hdr))) return true; break; case TIPC_ESTABLISHED: /* Accept only connection-based messages sent by peer */ if (unlikely(!tsk_peer_msg(tsk, hdr))) return false; if (unlikely(msg_errcode(hdr))) { tipc_set_sk_state(sk, TIPC_DISCONNECTING); /* Let timer expire on it's own */ tipc_node_remove_conn(net, tsk_peer_node(tsk), tsk->portid); sk->sk_state_change(sk); } return true; default: pr_err("Unknown sk_state %u\n", sk->sk_state); } return false; } /** * rcvbuf_limit - get proper overload limit of socket receive queue * @sk: socket * @skb: message * * For connection oriented messages, irrespective of importance, * default queue limit is 2 MB. * * For connectionless messages, queue limits are based on message * importance as follows: * * TIPC_LOW_IMPORTANCE (2 MB) * TIPC_MEDIUM_IMPORTANCE (4 MB) * TIPC_HIGH_IMPORTANCE (8 MB) * TIPC_CRITICAL_IMPORTANCE (16 MB) * * Returns overload limit according to corresponding message importance */ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) { struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = buf_msg(skb); if (unlikely(msg_in_group(hdr))) return sk->sk_rcvbuf; if (unlikely(!msg_connected(hdr))) return sk->sk_rcvbuf << msg_importance(hdr); if (likely(tsk->peer_caps & TIPC_BLOCK_FLOWCTL)) return sk->sk_rcvbuf; return FLOWCTL_MSG_LIM; } /** * tipc_sk_filter_rcv - validate incoming message * @sk: socket * @skb: pointer to message. * * Enqueues message on receive queue if acceptable; optionally handles * disconnect indication for a connected socket. * * Called with socket lock already taken * */ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, struct sk_buff_head *xmitq) { bool sk_conn = !tipc_sk_type_connectionless(sk); struct tipc_sock *tsk = tipc_sk(sk); struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = buf_msg(skb); struct net *net = sock_net(sk); struct sk_buff_head inputq; int limit, err = TIPC_OK; TIPC_SKB_CB(skb)->bytes_read = 0; __skb_queue_head_init(&inputq); __skb_queue_tail(&inputq, skb); if (unlikely(!msg_isdata(hdr))) tipc_sk_proto_rcv(sk, &inputq, xmitq); if (unlikely(grp)) tipc_group_filter_msg(grp, &inputq, xmitq); /* Validate and add to receive buffer if there is space */ while ((skb = __skb_dequeue(&inputq))) { hdr = buf_msg(skb); limit = rcvbuf_limit(sk, skb); if ((sk_conn && !tipc_sk_filter_connect(tsk, skb)) || (!sk_conn && msg_connected(hdr)) || (!grp && msg_in_group(hdr))) err = TIPC_ERR_NO_PORT; else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) { atomic_inc(&sk->sk_drops); err = TIPC_ERR_OVERLOAD; } if (unlikely(err)) { tipc_skb_reject(net, err, skb, xmitq); err = TIPC_OK; continue; } __skb_queue_tail(&sk->sk_receive_queue, skb); skb_set_owner_r(skb, sk); sk->sk_data_ready(sk); } } /** * tipc_sk_backlog_rcv - handle incoming message from backlog queue * @sk: socket * @skb: message * * Caller must hold socket lock */ static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { unsigned int before = sk_rmem_alloc_get(sk); struct sk_buff_head xmitq; unsigned int added; __skb_queue_head_init(&xmitq); tipc_sk_filter_rcv(sk, skb, &xmitq); added = sk_rmem_alloc_get(sk) - before; atomic_add(added, &tipc_sk(sk)->dupl_rcvcnt); /* Send pending response/rejected messages, if any */ tipc_node_distr_xmit(sock_net(sk), &xmitq); return 0; } /** * tipc_sk_enqueue - extract all buffers with destination 'dport' from * inputq and try adding them to socket or backlog queue * @inputq: list of incoming buffers with potentially different destinations * @sk: socket where the buffers should be enqueued * @dport: port number for the socket * * Caller must hold socket lock */ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, u32 dport, struct sk_buff_head *xmitq) { unsigned long time_limit = jiffies + usecs_to_jiffies(20000); struct sk_buff *skb; unsigned int lim; atomic_t *dcnt; u32 onode; while (skb_queue_len(inputq)) { if (unlikely(time_after_eq(jiffies, time_limit))) return; skb = tipc_skb_dequeue(inputq, dport); if (unlikely(!skb)) return; /* Add message directly to receive queue if possible */ if (!sock_owned_by_user(sk)) { tipc_sk_filter_rcv(sk, skb, xmitq); continue; } /* Try backlog, compensating for double-counted bytes */ dcnt = &tipc_sk(sk)->dupl_rcvcnt; if (!sk->sk_backlog.len) atomic_set(dcnt, 0); lim = rcvbuf_limit(sk, skb) + atomic_read(dcnt); if (likely(!sk_add_backlog(sk, skb, lim))) continue; /* Overload => reject message back to sender */ onode = tipc_own_addr(sock_net(sk)); atomic_inc(&sk->sk_drops); if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD)) __skb_queue_tail(xmitq, skb); break; } } /** * tipc_sk_rcv - handle a chain of incoming buffers * @inputq: buffer list containing the buffers * Consumes all buffers in list until inputq is empty * Note: may be called in multiple threads referring to the same queue */ void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) { struct sk_buff_head xmitq; u32 dnode, dport = 0; int err; struct tipc_sock *tsk; struct sock *sk; struct sk_buff *skb; __skb_queue_head_init(&xmitq); while (skb_queue_len(inputq)) { dport = tipc_skb_peek_port(inputq, dport); tsk = tipc_sk_lookup(net, dport); if (likely(tsk)) { sk = &tsk->sk; if (likely(spin_trylock_bh(&sk->sk_lock.slock))) { tipc_sk_enqueue(inputq, sk, dport, &xmitq); spin_unlock_bh(&sk->sk_lock.slock); } /* Send pending response/rejected messages, if any */ tipc_node_distr_xmit(sock_net(sk), &xmitq); sock_put(sk); continue; } /* No destination socket => dequeue skb if still there */ skb = tipc_skb_dequeue(inputq, dport); if (!skb) return; /* Try secondary lookup if unresolved named message */ err = TIPC_ERR_NO_PORT; if (tipc_msg_lookup_dest(net, skb, &err)) goto xmit; /* Prepare for message rejection */ if (!tipc_msg_reverse(tipc_own_addr(net), &skb, err)) continue; xmit: dnode = msg_destnode(buf_msg(skb)); tipc_node_xmit_skb(net, skb, dnode, dport); } } static int tipc_wait_for_connect(struct socket *sock, long *timeo_p) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk; int done; do { int err = sock_error(sk); if (err) return err; if (!*timeo_p) return -ETIMEDOUT; if (signal_pending(current)) return sock_intr_errno(*timeo_p); add_wait_queue(sk_sleep(sk), &wait); done = sk_wait_event(sk, timeo_p, sk->sk_state != TIPC_CONNECTING, &wait); remove_wait_queue(sk_sleep(sk), &wait); } while (!done); return 0; } static bool tipc_sockaddr_is_sane(struct sockaddr_tipc *addr) { if (addr->family != AF_TIPC) return false; if (addr->addrtype == TIPC_SERVICE_RANGE) return (addr->addr.nameseq.lower <= addr->addr.nameseq.upper); return (addr->addrtype == TIPC_SERVICE_ADDR || addr->addrtype == TIPC_SOCKET_ADDR); } /** * tipc_connect - establish a connection to another TIPC port * @sock: socket structure * @dest: socket address for destination port * @destlen: size of socket address data structure * @flags: file-related flags associated with socket * * Returns 0 on success, errno otherwise */ static int tipc_connect(struct socket *sock, struct sockaddr *dest, int destlen, int flags) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); struct sockaddr_tipc *dst = (struct sockaddr_tipc *)dest; struct msghdr m = {NULL,}; long timeout = (flags & O_NONBLOCK) ? 0 : tsk->conn_timeout; int previous; int res = 0; if (destlen != sizeof(struct sockaddr_tipc)) return -EINVAL; lock_sock(sk); if (tsk->group) { res = -EINVAL; goto exit; } if (dst->family == AF_UNSPEC) { memset(&tsk->peer, 0, sizeof(struct sockaddr_tipc)); if (!tipc_sk_type_connectionless(sk)) res = -EINVAL; goto exit; } if (!tipc_sockaddr_is_sane(dst)) { res = -EINVAL; goto exit; } /* DGRAM/RDM connect(), just save the destaddr */ if (tipc_sk_type_connectionless(sk)) { memcpy(&tsk->peer, dest, destlen); goto exit; } else if (dst->addrtype == TIPC_SERVICE_RANGE) { res = -EINVAL; goto exit; } previous = sk->sk_state; switch (sk->sk_state) { case TIPC_OPEN: /* Send a 'SYN-' to destination */ m.msg_name = dest; m.msg_namelen = destlen; /* If connect is in non-blocking case, set MSG_DONTWAIT to * indicate send_msg() is never blocked. */ if (!timeout) m.msg_flags = MSG_DONTWAIT; res = __tipc_sendmsg(sock, &m, 0); if ((res < 0) && (res != -EWOULDBLOCK)) goto exit; /* Just entered TIPC_CONNECTING state; the only * difference is that return value in non-blocking * case is EINPROGRESS, rather than EALREADY. */ res = -EINPROGRESS; /* fall thru' */ case TIPC_CONNECTING: if (!timeout) { if (previous == TIPC_CONNECTING) res = -EALREADY; goto exit; } timeout = msecs_to_jiffies(timeout); /* Wait until an 'ACK' or 'RST' arrives, or a timeout occurs */ res = tipc_wait_for_connect(sock, &timeout); break; case TIPC_ESTABLISHED: res = -EISCONN; break; default: res = -EINVAL; } exit: release_sock(sk); return res; } /** * tipc_listen - allow socket to listen for incoming connections * @sock: socket structure * @len: (unused) * * Returns 0 on success, errno otherwise */ static int tipc_listen(struct socket *sock, int len) { struct sock *sk = sock->sk; int res; lock_sock(sk); res = tipc_set_sk_state(sk, TIPC_LISTEN); release_sock(sk); return res; } static int tipc_wait_for_accept(struct socket *sock, long timeo) { struct sock *sk = sock->sk; DEFINE_WAIT_FUNC(wait, woken_wake_function); int err; /* True wake-one mechanism for incoming connections: only * one process gets woken up, not the 'whole herd'. * Since we do not 'race & poll' for established sockets * anymore, the common case will execute the loop only once. */ for (;;) { if (timeo && skb_queue_empty(&sk->sk_receive_queue)) { add_wait_queue(sk_sleep(sk), &wait); release_sock(sk); timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); lock_sock(sk); remove_wait_queue(sk_sleep(sk), &wait); } err = 0; if (!skb_queue_empty(&sk->sk_receive_queue)) break; err = -EAGAIN; if (!timeo) break; err = sock_intr_errno(timeo); if (signal_pending(current)) break; } return err; } /** * tipc_accept - wait for connection request * @sock: listening socket * @newsock: new socket that is to be connected * @flags: file-related flags associated with socket * * Returns 0 on success, errno otherwise */ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, bool kern) { struct sock *new_sk, *sk = sock->sk; struct sk_buff *buf; struct tipc_sock *new_tsock; struct tipc_msg *msg; long timeo; int res; lock_sock(sk); if (sk->sk_state != TIPC_LISTEN) { res = -EINVAL; goto exit; } timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); res = tipc_wait_for_accept(sock, timeo); if (res) goto exit; buf = skb_peek(&sk->sk_receive_queue); res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, kern); if (res) goto exit; security_sk_clone(sock->sk, new_sock->sk); new_sk = new_sock->sk; new_tsock = tipc_sk(new_sk); msg = buf_msg(buf); /* we lock on new_sk; but lockdep sees the lock on sk */ lock_sock_nested(new_sk, SINGLE_DEPTH_NESTING); /* * Reject any stray messages received by new socket * before the socket lock was taken (very, very unlikely) */ tsk_rej_rx_queue(new_sk); /* Connect new socket to it's peer */ tipc_sk_finish_conn(new_tsock, msg_origport(msg), msg_orignode(msg)); tsk_set_importance(new_tsock, msg_importance(msg)); if (msg_named(msg)) { new_tsock->conn_type = msg_nametype(msg); new_tsock->conn_instance = msg_nameinst(msg); } /* * Respond to 'SYN-' by discarding it & returning 'ACK'-. * Respond to 'SYN+' by queuing it on new socket. */ if (!msg_data_sz(msg)) { struct msghdr m = {NULL,}; tsk_advance_rx_queue(sk); __tipc_sendstream(new_sock, &m, 0); } else { __skb_dequeue(&sk->sk_receive_queue); __skb_queue_head(&new_sk->sk_receive_queue, buf); skb_set_owner_r(buf, new_sk); } release_sock(new_sk); exit: release_sock(sk); return res; } /** * tipc_shutdown - shutdown socket connection * @sock: socket structure * @how: direction to close (must be SHUT_RDWR) * * Terminates connection (if necessary), then purges socket's receive queue. * * Returns 0 on success, errno otherwise */ static int tipc_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; int res; if (how != SHUT_RDWR) return -EINVAL; lock_sock(sk); __tipc_shutdown(sock, TIPC_CONN_SHUTDOWN); sk->sk_shutdown = SHUTDOWN_MASK; if (sk->sk_state == TIPC_DISCONNECTING) { /* Discard any unreceived messages */ __skb_queue_purge(&sk->sk_receive_queue); res = 0; } else { res = -ENOTCONN; } /* Wake up anyone sleeping in poll. */ sk->sk_state_change(sk); release_sock(sk); return res; } static void tipc_sk_timeout(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); struct tipc_sock *tsk = tipc_sk(sk); u32 peer_port = tsk_peer_port(tsk); u32 peer_node = tsk_peer_node(tsk); u32 own_node = tsk_own_node(tsk); u32 own_port = tsk->portid; struct net *net = sock_net(sk); struct sk_buff *skb = NULL; bh_lock_sock(sk); if (!tipc_sk_connected(sk)) goto exit; /* Try again later if socket is busy */ if (sock_owned_by_user(sk)) { sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 20); goto exit; } if (tsk->probe_unacked) { tipc_set_sk_state(sk, TIPC_DISCONNECTING); tipc_node_remove_conn(net, peer_node, peer_port); sk->sk_state_change(sk); goto exit; } /* Send new probe */ skb = tipc_msg_create(CONN_MANAGER, CONN_PROBE, INT_H_SIZE, 0, peer_node, own_node, peer_port, own_port, TIPC_OK); tsk->probe_unacked = true; sk_reset_timer(sk, &sk->sk_timer, jiffies + CONN_PROBING_INTV); exit: bh_unlock_sock(sk); if (skb) tipc_node_xmit_skb(net, skb, peer_node, own_port); sock_put(sk); } static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq) { struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); struct publication *publ; u32 key; if (scope != TIPC_NODE_SCOPE) scope = TIPC_CLUSTER_SCOPE; if (tipc_sk_connected(sk)) return -EINVAL; key = tsk->portid + tsk->pub_count + 1; if (key == tsk->portid) return -EADDRINUSE; publ = tipc_nametbl_publish(net, seq->type, seq->lower, seq->upper, scope, tsk->portid, key); if (unlikely(!publ)) return -EINVAL; list_add(&publ->binding_sock, &tsk->publications); tsk->pub_count++; tsk->published = 1; return 0; } static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq) { struct net *net = sock_net(&tsk->sk); struct publication *publ; struct publication *safe; int rc = -EINVAL; if (scope != TIPC_NODE_SCOPE) scope = TIPC_CLUSTER_SCOPE; list_for_each_entry_safe(publ, safe, &tsk->publications, binding_sock) { if (seq) { if (publ->scope != scope) continue; if (publ->type != seq->type) continue; if (publ->lower != seq->lower) continue; if (publ->upper != seq->upper) break; tipc_nametbl_withdraw(net, publ->type, publ->lower, publ->upper, publ->key); rc = 0; break; } tipc_nametbl_withdraw(net, publ->type, publ->lower, publ->upper, publ->key); rc = 0; } if (list_empty(&tsk->publications)) tsk->published = 0; return rc; } /* tipc_sk_reinit: set non-zero address in all existing sockets * when we go from standalone to network mode. */ void tipc_sk_reinit(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct rhashtable_iter iter; struct tipc_sock *tsk; struct tipc_msg *msg; rhashtable_walk_enter(&tn->sk_rht, &iter); do { rhashtable_walk_start(&iter); while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) { sock_hold(&tsk->sk); rhashtable_walk_stop(&iter); lock_sock(&tsk->sk); msg = &tsk->phdr; msg_set_prevnode(msg, tipc_own_addr(net)); msg_set_orignode(msg, tipc_own_addr(net)); release_sock(&tsk->sk); rhashtable_walk_start(&iter); sock_put(&tsk->sk); } rhashtable_walk_stop(&iter); } while (tsk == ERR_PTR(-EAGAIN)); rhashtable_walk_exit(&iter); } static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_sock *tsk; rcu_read_lock(); tsk = rhashtable_lookup_fast(&tn->sk_rht, &portid, tsk_rht_params); if (tsk) sock_hold(&tsk->sk); rcu_read_unlock(); return tsk; } static int tipc_sk_insert(struct tipc_sock *tsk) { struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); struct tipc_net *tn = net_generic(net, tipc_net_id); u32 remaining = (TIPC_MAX_PORT - TIPC_MIN_PORT) + 1; u32 portid = prandom_u32() % remaining + TIPC_MIN_PORT; while (remaining--) { portid++; if ((portid < TIPC_MIN_PORT) || (portid > TIPC_MAX_PORT)) portid = TIPC_MIN_PORT; tsk->portid = portid; sock_hold(&tsk->sk); if (!rhashtable_lookup_insert_fast(&tn->sk_rht, &tsk->node, tsk_rht_params)) return 0; sock_put(&tsk->sk); } return -1; } static void tipc_sk_remove(struct tipc_sock *tsk) { struct sock *sk = &tsk->sk; struct tipc_net *tn = net_generic(sock_net(sk), tipc_net_id); if (!rhashtable_remove_fast(&tn->sk_rht, &tsk->node, tsk_rht_params)) { WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } } static const struct rhashtable_params tsk_rht_params = { .nelem_hint = 192, .head_offset = offsetof(struct tipc_sock, node), .key_offset = offsetof(struct tipc_sock, portid), .key_len = sizeof(u32), /* portid */ .max_size = 1048576, .min_size = 256, .automatic_shrinking = true, }; int tipc_sk_rht_init(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); return rhashtable_init(&tn->sk_rht, &tsk_rht_params); } void tipc_sk_rht_destroy(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); /* Wait for socket readers to complete */ synchronize_net(); rhashtable_destroy(&tn->sk_rht); } static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) { struct net *net = sock_net(&tsk->sk); struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = &tsk->phdr; struct tipc_name_seq seq; int rc; if (mreq->type < TIPC_RESERVED_TYPES) return -EACCES; if (mreq->scope > TIPC_NODE_SCOPE) return -EINVAL; if (grp) return -EACCES; grp = tipc_group_create(net, tsk->portid, mreq, &tsk->group_is_open); if (!grp) return -ENOMEM; tsk->group = grp; msg_set_lookup_scope(hdr, mreq->scope); msg_set_nametype(hdr, mreq->type); msg_set_dest_droppable(hdr, true); seq.type = mreq->type; seq.lower = mreq->instance; seq.upper = seq.lower; tipc_nametbl_build_group(net, grp, mreq->type, mreq->scope); rc = tipc_sk_publish(tsk, mreq->scope, &seq); if (rc) { tipc_group_delete(net, grp); tsk->group = NULL; return rc; } /* Eliminate any risk that a broadcast overtakes sent JOINs */ tsk->mc_method.rcast = true; tsk->mc_method.mandatory = true; tipc_group_join(net, grp, &tsk->sk.sk_rcvbuf); return rc; } static int tipc_sk_leave(struct tipc_sock *tsk) { struct net *net = sock_net(&tsk->sk); struct tipc_group *grp = tsk->group; struct tipc_name_seq seq; int scope; if (!grp) return -EINVAL; tipc_group_self(grp, &seq, &scope); tipc_group_delete(net, grp); tsk->group = NULL; tipc_sk_withdraw(tsk, scope, &seq); return 0; } /** * tipc_setsockopt - set socket option * @sock: socket structure * @lvl: option level * @opt: option identifier * @ov: pointer to new option value * @ol: length of option value * * For stream sockets only, accepts and ignores all IPPROTO_TCP options * (to ease compatibility). * * Returns 0 on success, errno otherwise */ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, char __user *ov, unsigned int ol) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); struct tipc_group_req mreq; u32 value = 0; int res = 0; if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM)) return 0; if (lvl != SOL_TIPC) return -ENOPROTOOPT; switch (opt) { case TIPC_IMPORTANCE: case TIPC_SRC_DROPPABLE: case TIPC_DEST_DROPPABLE: case TIPC_CONN_TIMEOUT: if (ol < sizeof(value)) return -EINVAL; if (get_user(value, (u32 __user *)ov)) return -EFAULT; break; case TIPC_GROUP_JOIN: if (ol < sizeof(mreq)) return -EINVAL; if (copy_from_user(&mreq, ov, sizeof(mreq))) return -EFAULT; break; default: if (ov || ol) return -EINVAL; } lock_sock(sk); switch (opt) { case TIPC_IMPORTANCE: res = tsk_set_importance(tsk, value); break; case TIPC_SRC_DROPPABLE: if (sock->type != SOCK_STREAM) tsk_set_unreliable(tsk, value); else res = -ENOPROTOOPT; break; case TIPC_DEST_DROPPABLE: tsk_set_unreturnable(tsk, value); break; case TIPC_CONN_TIMEOUT: tipc_sk(sk)->conn_timeout = value; break; case TIPC_MCAST_BROADCAST: tsk->mc_method.rcast = false; tsk->mc_method.mandatory = true; break; case TIPC_MCAST_REPLICAST: tsk->mc_method.rcast = true; tsk->mc_method.mandatory = true; break; case TIPC_GROUP_JOIN: res = tipc_sk_join(tsk, &mreq); break; case TIPC_GROUP_LEAVE: res = tipc_sk_leave(tsk); break; default: res = -EINVAL; } release_sock(sk); return res; } /** * tipc_getsockopt - get socket option * @sock: socket structure * @lvl: option level * @opt: option identifier * @ov: receptacle for option value * @ol: receptacle for length of option value * * For stream sockets only, returns 0 length result for all IPPROTO_TCP options * (to ease compatibility). * * Returns 0 on success, errno otherwise */ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, char __user *ov, int __user *ol) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); struct tipc_name_seq seq; int len, scope; u32 value; int res; if ((lvl == IPPROTO_TCP) && (sock->type == SOCK_STREAM)) return put_user(0, ol); if (lvl != SOL_TIPC) return -ENOPROTOOPT; res = get_user(len, ol); if (res) return res; lock_sock(sk); switch (opt) { case TIPC_IMPORTANCE: value = tsk_importance(tsk); break; case TIPC_SRC_DROPPABLE: value = tsk_unreliable(tsk); break; case TIPC_DEST_DROPPABLE: value = tsk_unreturnable(tsk); break; case TIPC_CONN_TIMEOUT: value = tsk->conn_timeout; /* no need to set "res", since already 0 at this point */ break; case TIPC_NODE_RECVQ_DEPTH: value = 0; /* was tipc_queue_size, now obsolete */ break; case TIPC_SOCK_RECVQ_DEPTH: value = skb_queue_len(&sk->sk_receive_queue); break; case TIPC_GROUP_JOIN: seq.type = 0; if (tsk->group) tipc_group_self(tsk->group, &seq, &scope); value = seq.type; break; default: res = -EINVAL; } release_sock(sk); if (res) return res; /* "get" failed */ if (len < sizeof(value)) return -EINVAL; if (copy_to_user(ov, &value, sizeof(value))) return -EFAULT; return put_user(sizeof(value), ol); } static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct net *net = sock_net(sock->sk); struct tipc_sioc_nodeid_req nr = {0}; struct tipc_sioc_ln_req lnr; void __user *argp = (void __user *)arg; switch (cmd) { case SIOCGETLINKNAME: if (copy_from_user(&lnr, argp, sizeof(lnr))) return -EFAULT; if (!tipc_node_get_linkname(net, lnr.bearer_id & 0xffff, lnr.peer, lnr.linkname, TIPC_MAX_LINK_NAME)) { if (copy_to_user(argp, &lnr, sizeof(lnr))) return -EFAULT; return 0; } return -EADDRNOTAVAIL; case SIOCGETNODEID: if (copy_from_user(&nr, argp, sizeof(nr))) return -EFAULT; if (!tipc_node_get_id(net, nr.peer, nr.node_id)) return -EADDRNOTAVAIL; if (copy_to_user(argp, &nr, sizeof(nr))) return -EFAULT; return 0; default: return -ENOIOCTLCMD; } } static int tipc_socketpair(struct socket *sock1, struct socket *sock2) { struct tipc_sock *tsk2 = tipc_sk(sock2->sk); struct tipc_sock *tsk1 = tipc_sk(sock1->sk); u32 onode = tipc_own_addr(sock_net(sock1->sk)); tsk1->peer.family = AF_TIPC; tsk1->peer.addrtype = TIPC_ADDR_ID; tsk1->peer.scope = TIPC_NODE_SCOPE; tsk1->peer.addr.id.ref = tsk2->portid; tsk1->peer.addr.id.node = onode; tsk2->peer.family = AF_TIPC; tsk2->peer.addrtype = TIPC_ADDR_ID; tsk2->peer.scope = TIPC_NODE_SCOPE; tsk2->peer.addr.id.ref = tsk1->portid; tsk2->peer.addr.id.node = onode; tipc_sk_finish_conn(tsk1, tsk2->portid, onode); tipc_sk_finish_conn(tsk2, tsk1->portid, onode); return 0; } /* Protocol switches for the various types of TIPC sockets */ static const struct proto_ops msg_ops = { .owner = THIS_MODULE, .family = AF_TIPC, .release = tipc_release, .bind = tipc_bind, .connect = tipc_connect, .socketpair = tipc_socketpair, .accept = sock_no_accept, .getname = tipc_getname, .poll = tipc_poll, .ioctl = tipc_ioctl, .listen = sock_no_listen, .shutdown = tipc_shutdown, .setsockopt = tipc_setsockopt, .getsockopt = tipc_getsockopt, .sendmsg = tipc_sendmsg, .recvmsg = tipc_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage }; static const struct proto_ops packet_ops = { .owner = THIS_MODULE, .family = AF_TIPC, .release = tipc_release, .bind = tipc_bind, .connect = tipc_connect, .socketpair = tipc_socketpair, .accept = tipc_accept, .getname = tipc_getname, .poll = tipc_poll, .ioctl = tipc_ioctl, .listen = tipc_listen, .shutdown = tipc_shutdown, .setsockopt = tipc_setsockopt, .getsockopt = tipc_getsockopt, .sendmsg = tipc_send_packet, .recvmsg = tipc_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage }; static const struct proto_ops stream_ops = { .owner = THIS_MODULE, .family = AF_TIPC, .release = tipc_release, .bind = tipc_bind, .connect = tipc_connect, .socketpair = tipc_socketpair, .accept = tipc_accept, .getname = tipc_getname, .poll = tipc_poll, .ioctl = tipc_ioctl, .listen = tipc_listen, .shutdown = tipc_shutdown, .setsockopt = tipc_setsockopt, .getsockopt = tipc_getsockopt, .sendmsg = tipc_sendstream, .recvmsg = tipc_recvstream, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage }; static const struct net_proto_family tipc_family_ops = { .owner = THIS_MODULE, .family = AF_TIPC, .create = tipc_sk_create }; static struct proto tipc_proto = { .name = "TIPC", .owner = THIS_MODULE, .obj_size = sizeof(struct tipc_sock), .sysctl_rmem = sysctl_tipc_rmem }; /** * tipc_socket_init - initialize TIPC socket interface * * Returns 0 on success, errno otherwise */ int tipc_socket_init(void) { int res; res = proto_register(&tipc_proto, 1); if (res) { pr_err("Failed to register TIPC protocol type\n"); goto out; } res = sock_register(&tipc_family_ops); if (res) { pr_err("Failed to register TIPC socket type\n"); proto_unregister(&tipc_proto); goto out; } out: return res; } /** * tipc_socket_stop - stop TIPC socket interface */ void tipc_socket_stop(void) { sock_unregister(tipc_family_ops.family); proto_unregister(&tipc_proto); } /* Caller should hold socket lock for the passed tipc socket. */ static int __tipc_nl_add_sk_con(struct sk_buff *skb, struct tipc_sock *tsk) { u32 peer_node; u32 peer_port; struct nlattr *nest; peer_node = tsk_peer_node(tsk); peer_port = tsk_peer_port(tsk); nest = nla_nest_start(skb, TIPC_NLA_SOCK_CON); if (nla_put_u32(skb, TIPC_NLA_CON_NODE, peer_node)) goto msg_full; if (nla_put_u32(skb, TIPC_NLA_CON_SOCK, peer_port)) goto msg_full; if (tsk->conn_type != 0) { if (nla_put_flag(skb, TIPC_NLA_CON_FLAG)) goto msg_full; if (nla_put_u32(skb, TIPC_NLA_CON_TYPE, tsk->conn_type)) goto msg_full; if (nla_put_u32(skb, TIPC_NLA_CON_INST, tsk->conn_instance)) goto msg_full; } nla_nest_end(skb, nest); return 0; msg_full: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int __tipc_nl_add_sk_info(struct sk_buff *skb, struct tipc_sock *tsk) { struct net *net = sock_net(skb->sk); struct sock *sk = &tsk->sk; if (nla_put_u32(skb, TIPC_NLA_SOCK_REF, tsk->portid) || nla_put_u32(skb, TIPC_NLA_SOCK_ADDR, tipc_own_addr(net))) return -EMSGSIZE; if (tipc_sk_connected(sk)) { if (__tipc_nl_add_sk_con(skb, tsk)) return -EMSGSIZE; } else if (!list_empty(&tsk->publications)) { if (nla_put_flag(skb, TIPC_NLA_SOCK_HAS_PUBL)) return -EMSGSIZE; } return 0; } /* Caller should hold socket lock for the passed tipc socket. */ static int __tipc_nl_add_sk(struct sk_buff *skb, struct netlink_callback *cb, struct tipc_sock *tsk) { struct nlattr *attrs; void *hdr; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_SOCK_GET); if (!hdr) goto msg_cancel; attrs = nla_nest_start(skb, TIPC_NLA_SOCK); if (!attrs) goto genlmsg_cancel; if (__tipc_nl_add_sk_info(skb, tsk)) goto attr_msg_cancel; nla_nest_end(skb, attrs); genlmsg_end(skb, hdr); return 0; attr_msg_cancel: nla_nest_cancel(skb, attrs); genlmsg_cancel: genlmsg_cancel(skb, hdr); msg_cancel: return -EMSGSIZE; } int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb, int (*skb_handler)(struct sk_buff *skb, struct netlink_callback *cb, struct tipc_sock *tsk)) { struct rhashtable_iter *iter = (void *)cb->args[4]; struct tipc_sock *tsk; int err; rhashtable_walk_start(iter); while ((tsk = rhashtable_walk_next(iter)) != NULL) { if (IS_ERR(tsk)) { err = PTR_ERR(tsk); if (err == -EAGAIN) { err = 0; continue; } break; } sock_hold(&tsk->sk); rhashtable_walk_stop(iter); lock_sock(&tsk->sk); err = skb_handler(skb, cb, tsk); if (err) { release_sock(&tsk->sk); sock_put(&tsk->sk); goto out; } release_sock(&tsk->sk); rhashtable_walk_start(iter); sock_put(&tsk->sk); } rhashtable_walk_stop(iter); out: return skb->len; } EXPORT_SYMBOL(tipc_nl_sk_walk); int tipc_dump_start(struct netlink_callback *cb) { return __tipc_dump_start(cb, sock_net(cb->skb->sk)); } EXPORT_SYMBOL(tipc_dump_start); int __tipc_dump_start(struct netlink_callback *cb, struct net *net) { /* tipc_nl_name_table_dump() uses cb->args[0...3]. */ struct rhashtable_iter *iter = (void *)cb->args[4]; struct tipc_net *tn = tipc_net(net); if (!iter) { iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return -ENOMEM; cb->args[4] = (long)iter; } rhashtable_walk_enter(&tn->sk_rht, iter); return 0; } int tipc_dump_done(struct netlink_callback *cb) { struct rhashtable_iter *hti = (void *)cb->args[4]; rhashtable_walk_exit(hti); kfree(hti); return 0; } EXPORT_SYMBOL(tipc_dump_done); int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, struct tipc_sock *tsk, u32 sk_filter_state, u64 (*tipc_diag_gen_cookie)(struct sock *sk)) { struct sock *sk = &tsk->sk; struct nlattr *attrs; struct nlattr *stat; /*filter response w.r.t sk_state*/ if (!(sk_filter_state & (1 << sk->sk_state))) return 0; attrs = nla_nest_start(skb, TIPC_NLA_SOCK); if (!attrs) goto msg_cancel; if (__tipc_nl_add_sk_info(skb, tsk)) goto attr_msg_cancel; if (nla_put_u32(skb, TIPC_NLA_SOCK_TYPE, (u32)sk->sk_type) || nla_put_u32(skb, TIPC_NLA_SOCK_TIPC_STATE, (u32)sk->sk_state) || nla_put_u32(skb, TIPC_NLA_SOCK_INO, sock_i_ino(sk)) || nla_put_u32(skb, TIPC_NLA_SOCK_UID, from_kuid_munged(sk_user_ns(NETLINK_CB(cb->skb).sk), sock_i_uid(sk))) || nla_put_u64_64bit(skb, TIPC_NLA_SOCK_COOKIE, tipc_diag_gen_cookie(sk), TIPC_NLA_SOCK_PAD)) goto attr_msg_cancel; stat = nla_nest_start(skb, TIPC_NLA_SOCK_STAT); if (!stat) goto attr_msg_cancel; if (nla_put_u32(skb, TIPC_NLA_SOCK_STAT_RCVQ, skb_queue_len(&sk->sk_receive_queue)) || nla_put_u32(skb, TIPC_NLA_SOCK_STAT_SENDQ, skb_queue_len(&sk->sk_write_queue)) || nla_put_u32(skb, TIPC_NLA_SOCK_STAT_DROP, atomic_read(&sk->sk_drops))) goto stat_msg_cancel; if (tsk->cong_link_cnt && nla_put_flag(skb, TIPC_NLA_SOCK_STAT_LINK_CONG)) goto stat_msg_cancel; if (tsk_conn_cong(tsk) && nla_put_flag(skb, TIPC_NLA_SOCK_STAT_CONN_CONG)) goto stat_msg_cancel; nla_nest_end(skb, stat); if (tsk->group) if (tipc_group_fill_sock_diag(tsk->group, skb)) goto stat_msg_cancel; nla_nest_end(skb, attrs); return 0; stat_msg_cancel: nla_nest_cancel(skb, stat); attr_msg_cancel: nla_nest_cancel(skb, attrs); msg_cancel: return -EMSGSIZE; } EXPORT_SYMBOL(tipc_sk_fill_sock_diag); int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb) { return tipc_nl_sk_walk(skb, cb, __tipc_nl_add_sk); } /* Caller should hold socket lock for the passed tipc socket. */ static int __tipc_nl_add_sk_publ(struct sk_buff *skb, struct netlink_callback *cb, struct publication *publ) { void *hdr; struct nlattr *attrs; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_PUBL_GET); if (!hdr) goto msg_cancel; attrs = nla_nest_start(skb, TIPC_NLA_PUBL); if (!attrs) goto genlmsg_cancel; if (nla_put_u32(skb, TIPC_NLA_PUBL_KEY, publ->key)) goto attr_msg_cancel; if (nla_put_u32(skb, TIPC_NLA_PUBL_TYPE, publ->type)) goto attr_msg_cancel; if (nla_put_u32(skb, TIPC_NLA_PUBL_LOWER, publ->lower)) goto attr_msg_cancel; if (nla_put_u32(skb, TIPC_NLA_PUBL_UPPER, publ->upper)) goto attr_msg_cancel; nla_nest_end(skb, attrs); genlmsg_end(skb, hdr); return 0; attr_msg_cancel: nla_nest_cancel(skb, attrs); genlmsg_cancel: genlmsg_cancel(skb, hdr); msg_cancel: return -EMSGSIZE; } /* Caller should hold socket lock for the passed tipc socket. */ static int __tipc_nl_list_sk_publ(struct sk_buff *skb, struct netlink_callback *cb, struct tipc_sock *tsk, u32 *last_publ) { int err; struct publication *p; if (*last_publ) { list_for_each_entry(p, &tsk->publications, binding_sock) { if (p->key == *last_publ) break; } if (list_entry_is_head(p, &tsk->publications, binding_sock)) { /* We never set seq or call nl_dump_check_consistent() * this means that setting prev_seq here will cause the * consistence check to fail in the netlink callback * handler. Resulting in the last NLMSG_DONE message * having the NLM_F_DUMP_INTR flag set. */ cb->prev_seq = 1; *last_publ = 0; return -EPIPE; } } else { p = list_first_entry(&tsk->publications, struct publication, binding_sock); } list_for_each_entry_from(p, &tsk->publications, binding_sock) { err = __tipc_nl_add_sk_publ(skb, cb, p); if (err) { *last_publ = p->key; return err; } } *last_publ = 0; return 0; } int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb) { int err; u32 tsk_portid = cb->args[0]; u32 last_publ = cb->args[1]; u32 done = cb->args[2]; struct net *net = sock_net(skb->sk); struct tipc_sock *tsk; if (!tsk_portid) { struct nlattr **attrs; struct nlattr *sock[TIPC_NLA_SOCK_MAX + 1]; err = tipc_nlmsg_parse(cb->nlh, &attrs); if (err) return err; if (!attrs[TIPC_NLA_SOCK]) return -EINVAL; err = nla_parse_nested(sock, TIPC_NLA_SOCK_MAX, attrs[TIPC_NLA_SOCK], tipc_nl_sock_policy, NULL); if (err) return err; if (!sock[TIPC_NLA_SOCK_REF]) return -EINVAL; tsk_portid = nla_get_u32(sock[TIPC_NLA_SOCK_REF]); } if (done) return 0; tsk = tipc_sk_lookup(net, tsk_portid); if (!tsk) return -EINVAL; lock_sock(&tsk->sk); err = __tipc_nl_list_sk_publ(skb, cb, tsk, &last_publ); if (!err) done = 1; release_sock(&tsk->sk); sock_put(&tsk->sk); cb->args[0] = tsk_portid; cb->args[1] = last_publ; cb->args[2] = done; return skb->len; } |
4248 4333 4310 4250 4229 4142 1674 4265 325 1472 80 80 4354 4313 4316 7 7 7 2 3061 937 3459 9 2 4229 283 283 283 4229 283 283 283 4230 4227 283 283 283 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 | /* * Copyright (C) 2001 Jens Axboe <axboe@suse.de> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public Licens * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- */ #ifndef __LINUX_BIO_H #define __LINUX_BIO_H #include <linux/highmem.h> #include <linux/mempool.h> #include <linux/ioprio.h> #include <linux/bug.h> #ifdef CONFIG_BLOCK #include <asm/io.h> /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ #include <linux/blk_types.h> #define BIO_DEBUG #ifdef BIO_DEBUG #define BIO_BUG_ON BUG_ON #else #define BIO_BUG_ON #endif #ifdef CONFIG_THP_SWAP #if HPAGE_PMD_NR > 256 #define BIO_MAX_PAGES HPAGE_PMD_NR #else #define BIO_MAX_PAGES 256 #endif #else #define BIO_MAX_PAGES 256 #endif #define bio_prio(bio) (bio)->bi_ioprio #define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio) #define bio_iter_iovec(bio, iter) \ bvec_iter_bvec((bio)->bi_io_vec, (iter)) #define bio_iter_page(bio, iter) \ bvec_iter_page((bio)->bi_io_vec, (iter)) #define bio_iter_len(bio, iter) \ bvec_iter_len((bio)->bi_io_vec, (iter)) #define bio_iter_offset(bio, iter) \ bvec_iter_offset((bio)->bi_io_vec, (iter)) #define bio_page(bio) bio_iter_page((bio), (bio)->bi_iter) #define bio_offset(bio) bio_iter_offset((bio), (bio)->bi_iter) #define bio_iovec(bio) bio_iter_iovec((bio), (bio)->bi_iter) #define bio_multiple_segments(bio) \ ((bio)->bi_iter.bi_size != bio_iovec(bio).bv_len) #define bvec_iter_sectors(iter) ((iter).bi_size >> 9) #define bvec_iter_end_sector(iter) ((iter).bi_sector + bvec_iter_sectors((iter))) #define bio_sectors(bio) bvec_iter_sectors((bio)->bi_iter) #define bio_end_sector(bio) bvec_iter_end_sector((bio)->bi_iter) /* * Return the data direction, READ or WRITE. */ #define bio_data_dir(bio) \ (op_is_write(bio_op(bio)) ? WRITE : READ) /* * Check whether this bio carries any data or not. A NULL bio is allowed. */ static inline bool bio_has_data(struct bio *bio) { if (bio && bio->bi_iter.bi_size && bio_op(bio) != REQ_OP_DISCARD && bio_op(bio) != REQ_OP_SECURE_ERASE && bio_op(bio) != REQ_OP_WRITE_ZEROES) return true; return false; } static inline bool bio_no_advance_iter(struct bio *bio) { return bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE || bio_op(bio) == REQ_OP_WRITE_SAME || bio_op(bio) == REQ_OP_WRITE_ZEROES; } static inline bool bio_mergeable(struct bio *bio) { if (bio->bi_opf & REQ_NOMERGE_FLAGS) return false; return true; } static inline unsigned int bio_cur_bytes(struct bio *bio) { if (bio_has_data(bio)) return bio_iovec(bio).bv_len; else /* dataless requests such as discard */ return bio->bi_iter.bi_size; } static inline void *bio_data(struct bio *bio) { if (bio_has_data(bio)) return page_address(bio_page(bio)) + bio_offset(bio); return NULL; } static inline bool bio_full(struct bio *bio) { return bio->bi_vcnt >= bio->bi_max_vecs; } /* * will die */ #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) /* * merge helpers etc */ /* Default implementation of BIOVEC_PHYS_MERGEABLE */ #define __BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) /* * allow arch override, for eg virtualized architectures (put in asm/io.h) */ #ifndef BIOVEC_PHYS_MERGEABLE #define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ __BIOVEC_PHYS_MERGEABLE(vec1, vec2) #endif #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \ (((addr1) | (mask)) == (((addr2) - 1) | (mask))) #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \ __BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, queue_segment_boundary((q))) /* * drivers should _never_ use the all version - the bio may have been split * before it got to the driver and the driver won't own all of it */ #define bio_for_each_segment_all(bvl, bio, i) \ for (i = 0, bvl = (bio)->bi_io_vec; i < (bio)->bi_vcnt; i++, bvl++) static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, unsigned bytes) { iter->bi_sector += bytes >> 9; if (bio_no_advance_iter(bio)) { iter->bi_size -= bytes; iter->bi_done += bytes; } else { bvec_iter_advance(bio->bi_io_vec, iter, bytes); /* TODO: It is reasonable to complete bio with error here. */ } } static inline bool bio_rewind_iter(struct bio *bio, struct bvec_iter *iter, unsigned int bytes) { iter->bi_sector -= bytes >> 9; if (bio_no_advance_iter(bio)) { iter->bi_size += bytes; iter->bi_done -= bytes; return true; } return bvec_iter_rewind(bio->bi_io_vec, iter, bytes); } #define __bio_for_each_segment(bvl, bio, iter, start) \ for (iter = (start); \ (iter).bi_size && \ ((bvl = bio_iter_iovec((bio), (iter))), 1); \ bio_advance_iter((bio), &(iter), (bvl).bv_len)) #define bio_for_each_segment(bvl, bio, iter) \ __bio_for_each_segment(bvl, bio, iter, (bio)->bi_iter) #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len) static inline unsigned bio_segments(struct bio *bio) { unsigned segs = 0; struct bio_vec bv; struct bvec_iter iter; /* * We special case discard/write same/write zeroes, because they * interpret bi_size differently: */ switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: case REQ_OP_WRITE_ZEROES: return 0; case REQ_OP_WRITE_SAME: return 1; default: break; } bio_for_each_segment(bv, bio, iter) segs++; return segs; } /* * get a reference to a bio, so it won't disappear. the intended use is * something like: * * bio_get(bio); * submit_bio(rw, bio); * if (bio->bi_flags ...) * do_something * bio_put(bio); * * without the bio_get(), it could potentially complete I/O before submit_bio * returns. and then bio would be freed memory when if (bio->bi_flags ...) * runs */ static inline void bio_get(struct bio *bio) { bio->bi_flags |= (1 << BIO_REFFED); smp_mb__before_atomic(); atomic_inc(&bio->__bi_cnt); } static inline void bio_cnt_set(struct bio *bio, unsigned int count) { if (count != 1) { bio->bi_flags |= (1 << BIO_REFFED); smp_mb(); } atomic_set(&bio->__bi_cnt, count); } static inline bool bio_flagged(struct bio *bio, unsigned int bit) { return (bio->bi_flags & (1U << bit)) != 0; } static inline void bio_set_flag(struct bio *bio, unsigned int bit) { bio->bi_flags |= (1U << bit); } static inline void bio_clear_flag(struct bio *bio, unsigned int bit) { bio->bi_flags &= ~(1U << bit); } static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv) { *bv = bio_iovec(bio); } static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv) { struct bvec_iter iter = bio->bi_iter; int idx; if (unlikely(!bio_multiple_segments(bio))) { *bv = bio_iovec(bio); return; } bio_advance_iter(bio, &iter, iter.bi_size); if (!iter.bi_bvec_done) idx = iter.bi_idx - 1; else /* in the middle of bvec */ idx = iter.bi_idx; *bv = bio->bi_io_vec[idx]; /* * iter.bi_bvec_done records actual length of the last bvec * if this bio ends in the middle of one io vector */ if (iter.bi_bvec_done) bv->bv_len = iter.bi_bvec_done; } static inline unsigned bio_pages_all(struct bio *bio) { WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); return bio->bi_vcnt; } static inline struct bio_vec *bio_first_bvec_all(struct bio *bio) { WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); return bio->bi_io_vec; } static inline struct page *bio_first_page_all(struct bio *bio) { return bio_first_bvec_all(bio)->bv_page; } static inline struct bio_vec *bio_last_bvec_all(struct bio *bio) { WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); return &bio->bi_io_vec[bio->bi_vcnt - 1]; } enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ }; /* * bio integrity payload */ struct bio_integrity_payload { struct bio *bip_bio; /* parent bio */ struct bvec_iter bip_iter; unsigned short bip_slab; /* slab the bip came from */ unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_max_vcnt; /* integrity bio_vec slots */ unsigned short bip_flags; /* control flags */ struct work_struct bip_work; /* I/O completion */ struct bio_vec *bip_vec; struct bio_vec bip_inline_vecs[0];/* embedded bvec array */ }; #if defined(CONFIG_BLK_DEV_INTEGRITY) static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) { if (bio->bi_opf & REQ_INTEGRITY) return bio->bi_integrity; return NULL; } static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { struct bio_integrity_payload *bip = bio_integrity(bio); if (bip) return bip->bip_flags & flag; return false; } static inline sector_t bip_get_seed(struct bio_integrity_payload *bip) { return bip->bip_iter.bi_sector; } static inline void bip_set_seed(struct bio_integrity_payload *bip, sector_t seed) { bip->bip_iter.bi_sector = seed; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ extern void bio_trim(struct bio *bio, int offset, int size); extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); /** * bio_next_split - get next @sectors from a bio, splitting if necessary * @bio: bio to split * @sectors: number of sectors to split from the front of @bio * @gfp: gfp mask * @bs: bio set to allocate from * * Returns a bio representing the next @sectors of @bio - if the bio is smaller * than @sectors, returns the original bio unchanged. */ static inline struct bio *bio_next_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs) { if (sectors >= bio_sectors(bio)) return bio; return bio_split(bio, sectors, gfp, bs); } enum { BIOSET_NEED_BVECS = BIT(0), BIOSET_NEED_RESCUER = BIT(1), }; extern int bioset_init(struct bio_set *, unsigned int, unsigned int, int flags); extern void bioset_exit(struct bio_set *); extern int biovec_init_pool(mempool_t *pool, int pool_entries); extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src); extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *); extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern struct bio_set fs_bio_set; static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) { return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set); } static inline struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) { return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); } extern blk_qc_t submit_bio(struct bio *); extern void bio_endio(struct bio *); static inline void bio_io_error(struct bio *bio) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); } static inline void bio_wouldblock_error(struct bio *bio) { bio->bi_status = BLK_STS_AGAIN; bio_endio(bio); } struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); extern int submit_bio_wait(struct bio *bio); extern void bio_advance(struct bio *, unsigned); extern void bio_init(struct bio *bio, struct bio_vec *table, unsigned short max_vecs); extern void bio_uninit(struct bio *); extern void bio_reset(struct bio *); void bio_chain(struct bio *, struct bio *); extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); bool __bio_try_merge_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); struct rq_map_data; extern struct bio *bio_map_user_iov(struct request_queue *, struct iov_iter *, gfp_t); extern void bio_unmap_user(struct bio *); extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int, gfp_t); extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, gfp_t, int); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); void generic_start_io_acct(struct request_queue *q, int op, unsigned long sectors, struct hd_struct *part); void generic_end_io_acct(struct request_queue *q, int op, struct hd_struct *part, unsigned long start_time); #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE # error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" #endif #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE extern void bio_flush_dcache_pages(struct bio *bi); #else static inline void bio_flush_dcache_pages(struct bio *bi) { } #endif extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter); extern void bio_copy_data(struct bio *dst, struct bio *src); extern void bio_list_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); extern struct bio *bio_copy_user_iov(struct request_queue *, struct rq_map_data *, struct iov_iter *, gfp_t); extern int bio_uncopy_user(struct bio *); void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); static inline void zero_fill_bio(struct bio *bio) { zero_fill_bio_iter(bio, bio->bi_iter); } extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); extern const char *bio_devname(struct bio *bio, char *buffer); #define bio_set_dev(bio, bdev) \ do { \ if ((bio)->bi_disk != (bdev)->bd_disk) \ bio_clear_flag(bio, BIO_THROTTLED);\ (bio)->bi_disk = (bdev)->bd_disk; \ (bio)->bi_partno = (bdev)->bd_partno; \ } while (0) #define bio_copy_dev(dst, src) \ do { \ (dst)->bi_disk = (src)->bi_disk; \ (dst)->bi_partno = (src)->bi_partno; \ } while (0) #define bio_dev(bio) \ disk_devt((bio)->bi_disk) #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) int bio_associate_blkcg_from_page(struct bio *bio, struct page *page); #else static inline int bio_associate_blkcg_from_page(struct bio *bio, struct page *page) { return 0; } #endif #ifdef CONFIG_BLK_CGROUP int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg); void bio_disassociate_task(struct bio *bio); void bio_clone_blkcg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ static inline int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) { return 0; } static inline void bio_disassociate_task(struct bio *bio) { } static inline void bio_clone_blkcg_association(struct bio *dst, struct bio *src) { } #endif /* CONFIG_BLK_CGROUP */ #ifdef CONFIG_HIGHMEM /* * remember never ever reenable interrupts between a bvec_kmap_irq and * bvec_kunmap_irq! */ static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) { unsigned long addr; /* * might not be a highmem page, but the preempt/irq count * balancing is a lot nicer this way */ local_irq_save(*flags); addr = (unsigned long) kmap_atomic(bvec->bv_page); BUG_ON(addr & ~PAGE_MASK); return (char *) addr + bvec->bv_offset; } static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) { unsigned long ptr = (unsigned long) buffer & PAGE_MASK; kunmap_atomic((void *) ptr); local_irq_restore(*flags); } #else static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) { return page_address(bvec->bv_page) + bvec->bv_offset; } static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) { *flags = 0; } #endif /* * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. * * A bio_list anchors a singly-linked list of bios chained through the bi_next * member of the bio. The bio_list also caches the last list member to allow * fast access to the tail. */ struct bio_list { struct bio *head; struct bio *tail; }; static inline int bio_list_empty(const struct bio_list *bl) { return bl->head == NULL; } static inline void bio_list_init(struct bio_list *bl) { bl->head = bl->tail = NULL; } #define BIO_EMPTY_LIST { NULL, NULL } #define bio_list_for_each(bio, bl) \ for (bio = (bl)->head; bio; bio = bio->bi_next) static inline unsigned bio_list_size(const struct bio_list *bl) { unsigned sz = 0; struct bio *bio; bio_list_for_each(bio, bl) sz++; return sz; } static inline void bio_list_add(struct bio_list *bl, struct bio *bio) { bio->bi_next = NULL; if (bl->tail) bl->tail->bi_next = bio; else bl->head = bio; bl->tail = bio; } static inline void bio_list_add_head(struct bio_list *bl, struct bio *bio) { bio->bi_next = bl->head; bl->head = bio; if (!bl->tail) bl->tail = bio; } static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) { if (!bl2->head) return; if (bl->tail) bl->tail->bi_next = bl2->head; else bl->head = bl2->head; bl->tail = bl2->tail; } static inline void bio_list_merge_head(struct bio_list *bl, struct bio_list *bl2) { if (!bl2->head) return; if (bl->head) bl2->tail->bi_next = bl->head; else bl->tail = bl2->tail; bl->head = bl2->head; } static inline struct bio *bio_list_peek(struct bio_list *bl) { return bl->head; } static inline struct bio *bio_list_pop(struct bio_list *bl) { struct bio *bio = bl->head; if (bio) { bl->head = bl->head->bi_next; if (!bl->head) bl->tail = NULL; bio->bi_next = NULL; } return bio; } static inline struct bio *bio_list_get(struct bio_list *bl) { struct bio *bio = bl->head; bl->head = bl->tail = NULL; return bio; } /* * Increment chain count for the bio. Make sure the CHAIN flag update * is visible before the raised count. */ static inline void bio_inc_remaining(struct bio *bio) { bio_set_flag(bio, BIO_CHAIN); smp_mb__before_atomic(); atomic_inc(&bio->__bi_remaining); } /* * bio_set is used to allow other portions of the IO system to * allocate their own private memory pools for bio and iovec structures. * These memory pools in turn all allocate from the bio_slab * and the bvec_slabs[]. */ #define BIO_POOL_SIZE 2 struct bio_set { struct kmem_cache *bio_slab; unsigned int front_pad; mempool_t bio_pool; mempool_t bvec_pool; #if defined(CONFIG_BLK_DEV_INTEGRITY) mempool_t bio_integrity_pool; mempool_t bvec_integrity_pool; #endif /* * Deadlock avoidance for stacking block drivers: see comments in * bio_alloc_bioset() for details */ spinlock_t rescue_lock; struct bio_list rescue_list; struct work_struct rescue_work; struct workqueue_struct *rescue_workqueue; }; struct biovec_slab { int nr_vecs; char *name; struct kmem_cache *slab; }; static inline bool bioset_initialized(struct bio_set *bs) { return bs->bio_slab != NULL; } /* * a small number of entries is fine, not going to be performance critical. * basically we just need to survive */ #define BIO_SPLIT_ENTRIES 2 #if defined(CONFIG_BLK_DEV_INTEGRITY) #define bip_for_each_vec(bvl, bip, iter) \ for_each_bvec(bvl, (bip)->bip_vec, iter, (bip)->bip_iter) #define bio_for_each_integrity_vec(_bvl, _bio, _iter) \ for_each_bio(_bio) \ bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); extern bool bio_integrity_prep(struct bio *); extern void bio_integrity_advance(struct bio *, unsigned int); extern void bio_integrity_trim(struct bio *); extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t); extern int bioset_integrity_create(struct bio_set *, int); extern void bioset_integrity_free(struct bio_set *); extern void bio_integrity_init(void); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline void *bio_integrity(struct bio *bio) { return NULL; } static inline int bioset_integrity_create(struct bio_set *bs, int pool_size) { return 0; } static inline void bioset_integrity_free (struct bio_set *bs) { return; } static inline bool bio_integrity_prep(struct bio *bio) { return true; } static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask) { return 0; } static inline void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { return; } static inline void bio_integrity_trim(struct bio *bio) { return; } static inline void bio_integrity_init(void) { return; } static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { return false; } static inline void *bio_integrity_alloc(struct bio * bio, gfp_t gfp, unsigned int nr) { return ERR_PTR(-EINVAL); } static inline int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { return 0; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ #endif /* CONFIG_BLOCK */ #endif /* __LINUX_BIO_H */ |
48 48 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 | /* Helpers for initial module or kernel cmdline parsing Copyright (C) 2001 Rusty Russell. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/device.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/ctype.h> #ifdef CONFIG_SYSFS /* Protects all built-in parameters, modules use their own param_lock */ static DEFINE_MUTEX(param_lock); /* Use the module's mutex, or if built-in use the built-in mutex */ #ifdef CONFIG_MODULES #define KPARAM_MUTEX(mod) ((mod) ? &(mod)->param_lock : ¶m_lock) #else #define KPARAM_MUTEX(mod) (¶m_lock) #endif static inline void check_kparam_locked(struct module *mod) { BUG_ON(!mutex_is_locked(KPARAM_MUTEX(mod))); } #else static inline void check_kparam_locked(struct module *mod) { } #endif /* !CONFIG_SYSFS */ /* This just allows us to keep track of which parameters are kmalloced. */ struct kmalloced_param { struct list_head list; char val[]; }; static LIST_HEAD(kmalloced_params); static DEFINE_SPINLOCK(kmalloced_params_lock); static void *kmalloc_parameter(unsigned int size) { struct kmalloced_param *p; p = kmalloc(sizeof(*p) + size, GFP_KERNEL); if (!p) return NULL; spin_lock(&kmalloced_params_lock); list_add(&p->list, &kmalloced_params); spin_unlock(&kmalloced_params_lock); return p->val; } /* Does nothing if parameter wasn't kmalloced above. */ static void maybe_kfree_parameter(void *param) { struct kmalloced_param *p; spin_lock(&kmalloced_params_lock); list_for_each_entry(p, &kmalloced_params, list) { if (p->val == param) { list_del(&p->list); kfree(p); break; } } spin_unlock(&kmalloced_params_lock); } static char dash2underscore(char c) { if (c == '-') return '_'; return c; } bool parameqn(const char *a, const char *b, size_t n) { size_t i; for (i = 0; i < n; i++) { if (dash2underscore(a[i]) != dash2underscore(b[i])) return false; } return true; } bool parameq(const char *a, const char *b) { return parameqn(a, b, strlen(a)+1); } static void param_check_unsafe(const struct kernel_param *kp) { if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { pr_notice("Setting dangerous option %s - tainting kernel\n", kp->name); add_taint(TAINT_USER, LOCKDEP_STILL_OK); } } static int parse_one(char *param, char *val, const char *doing, const struct kernel_param *params, unsigned num_params, s16 min_level, s16 max_level, void *arg, int (*handle_unknown)(char *param, char *val, const char *doing, void *arg)) { unsigned int i; int err; /* Find parameter */ for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { if (params[i].level < min_level || params[i].level > max_level) return 0; /* No one handled NULL, so do it here. */ if (!val && !(params[i].ops->flags & KERNEL_PARAM_OPS_FL_NOARG)) return -EINVAL; pr_debug("handling %s with %p\n", param, params[i].ops->set); kernel_param_lock(params[i].mod); param_check_unsafe(¶ms[i]); err = params[i].ops->set(val, ¶ms[i]); kernel_param_unlock(params[i].mod); return err; } } if (handle_unknown) { pr_debug("doing %s: %s='%s'\n", doing, param, val); return handle_unknown(param, val, doing, arg); } pr_debug("Unknown argument '%s'\n", param); return -ENOENT; } /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ char *parse_args(const char *doing, char *args, const struct kernel_param *params, unsigned num, s16 min_level, s16 max_level, void *arg, int (*unknown)(char *param, char *val, const char *doing, void *arg)) { char *param, *val, *err = NULL; /* Chew leading spaces */ args = skip_spaces(args); if (*args) pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args); while (*args) { int ret; int irq_was_disabled; args = next_arg(args, ¶m, &val); /* Stop at -- */ if (!val && strcmp(param, "--") == 0) return err ?: args; irq_was_disabled = irqs_disabled(); ret = parse_one(param, val, doing, params, num, min_level, max_level, arg, unknown); if (irq_was_disabled && !irqs_disabled()) pr_warn("%s: option '%s' enabled irq's!\n", doing, param); switch (ret) { case 0: continue; case -ENOENT: pr_err("%s: Unknown parameter `%s'\n", doing, param); break; case -ENOSPC: pr_err("%s: `%s' too large for parameter `%s'\n", doing, val ?: "", param); break; default: pr_err("%s: `%s' invalid for parameter `%s'\n", doing, val ?: "", param); break; } err = ERR_PTR(ret); } return err; } /* Lazy bastard, eh? */ #define STANDARD_PARAM_DEF(name, type, format, strtolfn) \ int param_set_##name(const char *val, const struct kernel_param *kp) \ { \ return strtolfn(val, 0, (type *)kp->arg); \ } \ int param_get_##name(char *buffer, const struct kernel_param *kp) \ { \ return scnprintf(buffer, PAGE_SIZE, format "\n", \ *((type *)kp->arg)); \ } \ const struct kernel_param_ops param_ops_##name = { \ .set = param_set_##name, \ .get = param_get_##name, \ }; \ EXPORT_SYMBOL(param_set_##name); \ EXPORT_SYMBOL(param_get_##name); \ EXPORT_SYMBOL(param_ops_##name) STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); STANDARD_PARAM_DEF(long, long, "%li", kstrtol); STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); int param_set_charp(const char *val, const struct kernel_param *kp) { if (strlen(val) > 1024) { pr_err("%s: string parameter too long\n", kp->name); return -ENOSPC; } maybe_kfree_parameter(*(char **)kp->arg); /* This is a hack. We can't kmalloc in early boot, and we * don't need to; this mangled commandline is preserved. */ if (slab_is_available()) { *(char **)kp->arg = kmalloc_parameter(strlen(val)+1); if (!*(char **)kp->arg) return -ENOMEM; strcpy(*(char **)kp->arg, val); } else *(const char **)kp->arg = val; return 0; } EXPORT_SYMBOL(param_set_charp); int param_get_charp(char *buffer, const struct kernel_param *kp) { return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg)); } EXPORT_SYMBOL(param_get_charp); void param_free_charp(void *arg) { maybe_kfree_parameter(*((char **)arg)); } EXPORT_SYMBOL(param_free_charp); const struct kernel_param_ops param_ops_charp = { .set = param_set_charp, .get = param_get_charp, .free = param_free_charp, }; EXPORT_SYMBOL(param_ops_charp); /* Actually could be a bool or an int, for historical reasons. */ int param_set_bool(const char *val, const struct kernel_param *kp) { /* No equals means "set"... */ if (!val) val = "1"; /* One of =[yYnN01] */ return strtobool(val, kp->arg); } EXPORT_SYMBOL(param_set_bool); int param_get_bool(char *buffer, const struct kernel_param *kp) { /* Y and N chosen as being relatively non-coder friendly */ return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 'Y' : 'N'); } EXPORT_SYMBOL(param_get_bool); const struct kernel_param_ops param_ops_bool = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bool, .get = param_get_bool, }; EXPORT_SYMBOL(param_ops_bool); int param_set_bool_enable_only(const char *val, const struct kernel_param *kp) { int err = 0; bool new_value; bool orig_value = *(bool *)kp->arg; struct kernel_param dummy_kp = *kp; dummy_kp.arg = &new_value; err = param_set_bool(val, &dummy_kp); if (err) return err; /* Don't let them unset it once it's set! */ if (!new_value && orig_value) return -EROFS; if (new_value) err = param_set_bool(val, kp); return err; } EXPORT_SYMBOL_GPL(param_set_bool_enable_only); const struct kernel_param_ops param_ops_bool_enable_only = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bool_enable_only, .get = param_get_bool, }; EXPORT_SYMBOL_GPL(param_ops_bool_enable_only); /* This one must be bool. */ int param_set_invbool(const char *val, const struct kernel_param *kp) { int ret; bool boolval; struct kernel_param dummy; dummy.arg = &boolval; ret = param_set_bool(val, &dummy); if (ret == 0) *(bool *)kp->arg = !boolval; return ret; } EXPORT_SYMBOL(param_set_invbool); int param_get_invbool(char *buffer, const struct kernel_param *kp) { return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y'); } EXPORT_SYMBOL(param_get_invbool); const struct kernel_param_ops param_ops_invbool = { .set = param_set_invbool, .get = param_get_invbool, }; EXPORT_SYMBOL(param_ops_invbool); int param_set_bint(const char *val, const struct kernel_param *kp) { /* Match bool exactly, by re-using it. */ struct kernel_param boolkp = *kp; bool v; int ret; boolkp.arg = &v; ret = param_set_bool(val, &boolkp); if (ret == 0) *(int *)kp->arg = v; return ret; } EXPORT_SYMBOL(param_set_bint); const struct kernel_param_ops param_ops_bint = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bint, .get = param_get_int, }; EXPORT_SYMBOL(param_ops_bint); /* We break the rule and mangle the string. */ static int param_array(struct module *mod, const char *name, const char *val, unsigned int min, unsigned int max, void *elem, int elemsize, int (*set)(const char *, const struct kernel_param *kp), s16 level, unsigned int *num) { int ret; struct kernel_param kp; char save; /* Get the name right for errors. */ kp.name = name; kp.arg = elem; kp.level = level; *num = 0; /* We expect a comma-separated list of values. */ do { int len; if (*num == max) { pr_err("%s: can only take %i arguments\n", name, max); return -EINVAL; } len = strcspn(val, ","); /* nul-terminate and parse */ save = val[len]; ((char *)val)[len] = '\0'; check_kparam_locked(mod); ret = set(val, &kp); if (ret != 0) return ret; kp.arg += elemsize; val += len+1; (*num)++; } while (save == ','); if (*num < min) { pr_err("%s: needs at least %i arguments\n", name, min); return -EINVAL; } return 0; } static int param_array_set(const char *val, const struct kernel_param *kp) { const struct kparam_array *arr = kp->arr; unsigned int temp_num; return param_array(kp->mod, kp->name, val, 1, arr->max, arr->elem, arr->elemsize, arr->ops->set, kp->level, arr->num ?: &temp_num); } static int param_array_get(char *buffer, const struct kernel_param *kp) { int i, off, ret; const struct kparam_array *arr = kp->arr; struct kernel_param p = *kp; for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) { /* Replace \n with comma */ if (i) buffer[off - 1] = ','; p.arg = arr->elem + arr->elemsize * i; check_kparam_locked(p.mod); ret = arr->ops->get(buffer + off, &p); if (ret < 0) return ret; off += ret; } buffer[off] = '\0'; return off; } static void param_array_free(void *arg) { unsigned int i; const struct kparam_array *arr = arg; if (arr->ops->free) for (i = 0; i < (arr->num ? *arr->num : arr->max); i++) arr->ops->free(arr->elem + arr->elemsize * i); } const struct kernel_param_ops param_array_ops = { .set = param_array_set, .get = param_array_get, .free = param_array_free, }; EXPORT_SYMBOL(param_array_ops); int param_set_copystring(const char *val, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; if (strlen(val)+1 > kps->maxlen) { pr_err("%s: string doesn't fit in %u chars.\n", kp->name, kps->maxlen-1); return -ENOSPC; } strcpy(kps->string, val); return 0; } EXPORT_SYMBOL(param_set_copystring); int param_get_string(char *buffer, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string); } EXPORT_SYMBOL(param_get_string); const struct kernel_param_ops param_ops_string = { .set = param_set_copystring, .get = param_get_string, }; EXPORT_SYMBOL(param_ops_string); /* sysfs output in /sys/modules/XYZ/parameters/ */ #define to_module_attr(n) container_of(n, struct module_attribute, attr) #define to_module_kobject(n) container_of(n, struct module_kobject, kobj) struct param_attribute { struct module_attribute mattr; const struct kernel_param *param; }; struct module_param_attrs { unsigned int num; struct attribute_group grp; struct param_attribute attrs[0]; }; #ifdef CONFIG_SYSFS #define to_param_attr(n) container_of(n, struct param_attribute, mattr) static ssize_t param_attr_show(struct module_attribute *mattr, struct module_kobject *mk, char *buf) { int count; struct param_attribute *attribute = to_param_attr(mattr); if (!attribute->param->ops->get) return -EPERM; kernel_param_lock(mk->mod); count = attribute->param->ops->get(buf, attribute->param); kernel_param_unlock(mk->mod); return count; } /* sysfs always hands a nul-terminated string in buf. We rely on that. */ static ssize_t param_attr_store(struct module_attribute *mattr, struct module_kobject *mk, const char *buf, size_t len) { int err; struct param_attribute *attribute = to_param_attr(mattr); if (!attribute->param->ops->set) return -EPERM; kernel_param_lock(mk->mod); param_check_unsafe(attribute->param); err = attribute->param->ops->set(buf, attribute->param); kernel_param_unlock(mk->mod); if (!err) return len; return err; } #endif #ifdef CONFIG_MODULES #define __modinit #else #define __modinit __init #endif #ifdef CONFIG_SYSFS void kernel_param_lock(struct module *mod) { mutex_lock(KPARAM_MUTEX(mod)); } void kernel_param_unlock(struct module *mod) { mutex_unlock(KPARAM_MUTEX(mod)); } EXPORT_SYMBOL(kernel_param_lock); EXPORT_SYMBOL(kernel_param_unlock); /* * add_sysfs_param - add a parameter to sysfs * @mk: struct module_kobject * @kp: the actual parameter definition to add to sysfs * @name: name of parameter * * Create a kobject if for a (per-module) parameter if mp NULL, and * create file in sysfs. Returns an error on out of memory. Always cleans up * if there's an error. */ static __modinit int add_sysfs_param(struct module_kobject *mk, const struct kernel_param *kp, const char *name) { struct module_param_attrs *new_mp; struct attribute **new_attrs; unsigned int i; /* We don't bother calling this with invisible parameters. */ BUG_ON(!kp->perm); if (!mk->mp) { /* First allocation. */ mk->mp = kzalloc(sizeof(*mk->mp), GFP_KERNEL); if (!mk->mp) return -ENOMEM; mk->mp->grp.name = "parameters"; /* NULL-terminated attribute array. */ mk->mp->grp.attrs = kzalloc(sizeof(mk->mp->grp.attrs[0]), GFP_KERNEL); /* Caller will cleanup via free_module_param_attrs */ if (!mk->mp->grp.attrs) return -ENOMEM; } /* Enlarge allocations. */ new_mp = krealloc(mk->mp, sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1), GFP_KERNEL); if (!new_mp) return -ENOMEM; mk->mp = new_mp; /* Extra pointer for NULL terminator */ new_attrs = krealloc(mk->mp->grp.attrs, sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2), GFP_KERNEL); if (!new_attrs) return -ENOMEM; mk->mp->grp.attrs = new_attrs; /* Tack new one on the end. */ memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0])); sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr); mk->mp->attrs[mk->mp->num].param = kp; mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show; /* Do not allow runtime DAC changes to make param writable. */ if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store; else mk->mp->attrs[mk->mp->num].mattr.store = NULL; mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name; mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm; mk->mp->num++; /* Fix up all the pointers, since krealloc can move us */ for (i = 0; i < mk->mp->num; i++) mk->mp->grp.attrs[i] = &mk->mp->attrs[i].mattr.attr; mk->mp->grp.attrs[mk->mp->num] = NULL; return 0; } #ifdef CONFIG_MODULES static void free_module_param_attrs(struct module_kobject *mk) { if (mk->mp) kfree(mk->mp->grp.attrs); kfree(mk->mp); mk->mp = NULL; } /* * module_param_sysfs_setup - setup sysfs support for one module * @mod: module * @kparam: module parameters (array) * @num_params: number of module parameters * * Adds sysfs entries for module parameters under * /sys/module/[mod->name]/parameters/ */ int module_param_sysfs_setup(struct module *mod, const struct kernel_param *kparam, unsigned int num_params) { int i, err; bool params = false; for (i = 0; i < num_params; i++) { if (kparam[i].perm == 0) continue; err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name); if (err) { free_module_param_attrs(&mod->mkobj); return err; } params = true; } if (!params) return 0; /* Create the param group. */ err = sysfs_create_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp); if (err) free_module_param_attrs(&mod->mkobj); return err; } /* * module_param_sysfs_remove - remove sysfs support for one module * @mod: module * * Remove sysfs entries for module parameters and the corresponding * kobject. */ void module_param_sysfs_remove(struct module *mod) { if (mod->mkobj.mp) { sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp); /* We are positive that no one is using any param * attrs at this point. Deallocate immediately. */ free_module_param_attrs(&mod->mkobj); } } #endif void destroy_params(const struct kernel_param *params, unsigned num) { unsigned int i; for (i = 0; i < num; i++) if (params[i].ops->free) params[i].ops->free(params[i].arg); } static struct module_kobject * __init locate_module_kobject(const char *name) { struct module_kobject *mk; struct kobject *kobj; int err; kobj = kset_find_obj(module_kset, name); if (kobj) { mk = to_module_kobject(kobj); } else { mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); BUG_ON(!mk); mk->mod = THIS_MODULE; mk->kobj.kset = module_kset; err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name); #ifdef CONFIG_MODULES if (!err) err = sysfs_create_file(&mk->kobj, &module_uevent.attr); #endif if (err) { kobject_put(&mk->kobj); pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", name, err); return NULL; } /* So that we hold reference in both cases. */ kobject_get(&mk->kobj); } return mk; } static void __init kernel_add_sysfs_param(const char *name, const struct kernel_param *kparam, unsigned int name_skip) { struct module_kobject *mk; int err; mk = locate_module_kobject(name); if (!mk) return; /* We need to remove old parameters before adding more. */ if (mk->mp) sysfs_remove_group(&mk->kobj, &mk->mp->grp); /* These should not fail at boot. */ err = add_sysfs_param(mk, kparam, kparam->name + name_skip); BUG_ON(err); err = sysfs_create_group(&mk->kobj, &mk->mp->grp); BUG_ON(err); kobject_uevent(&mk->kobj, KOBJ_ADD); kobject_put(&mk->kobj); } /* * param_sysfs_builtin - add sysfs parameters for built-in modules * * Add module_parameters to sysfs for "modules" built into the kernel. * * The "module" name (KBUILD_MODNAME) is stored before a dot, the * "parameter" name is stored behind a dot in kernel_param->name. So, * extract the "module" name for all built-in kernel_param-eters, * and for all who have the same, call kernel_add_sysfs_param. */ static void __init param_sysfs_builtin(void) { const struct kernel_param *kp; unsigned int name_len; char modname[MODULE_NAME_LEN]; for (kp = __start___param; kp < __stop___param; kp++) { char *dot; if (kp->perm == 0) continue; dot = strchr(kp->name, '.'); if (!dot) { /* This happens for core_param() */ strcpy(modname, "kernel"); name_len = 0; } else { name_len = dot - kp->name + 1; strlcpy(modname, kp->name, name_len); } kernel_add_sysfs_param(modname, kp, name_len); } } ssize_t __modver_version_show(struct module_attribute *mattr, struct module_kobject *mk, char *buf) { struct module_version_attribute *vattr = container_of(mattr, struct module_version_attribute, mattr); return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version); } extern const struct module_version_attribute *__start___modver[]; extern const struct module_version_attribute *__stop___modver[]; static void __init version_sysfs_builtin(void) { const struct module_version_attribute **p; struct module_kobject *mk; int err; for (p = __start___modver; p < __stop___modver; p++) { const struct module_version_attribute *vattr = *p; mk = locate_module_kobject(vattr->module_name); if (mk) { err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); WARN_ON_ONCE(err); kobject_uevent(&mk->kobj, KOBJ_ADD); kobject_put(&mk->kobj); } } } /* module-related sysfs stuff */ static ssize_t module_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct module_attribute *attribute; struct module_kobject *mk; int ret; attribute = to_module_attr(attr); mk = to_module_kobject(kobj); if (!attribute->show) return -EIO; ret = attribute->show(attribute, mk, buf); return ret; } static ssize_t module_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t len) { struct module_attribute *attribute; struct module_kobject *mk; int ret; attribute = to_module_attr(attr); mk = to_module_kobject(kobj); if (!attribute->store) return -EIO; ret = attribute->store(attribute, mk, buf, len); return ret; } static const struct sysfs_ops module_sysfs_ops = { .show = module_attr_show, .store = module_attr_store, }; static int uevent_filter(struct kset *kset, struct kobject *kobj) { struct kobj_type *ktype = get_ktype(kobj); if (ktype == &module_ktype) return 1; return 0; } static const struct kset_uevent_ops module_uevent_ops = { .filter = uevent_filter, }; struct kset *module_kset; int module_sysfs_initialized; static void module_kobj_release(struct kobject *kobj) { struct module_kobject *mk = to_module_kobject(kobj); complete(mk->kobj_completion); } struct kobj_type module_ktype = { .release = module_kobj_release, .sysfs_ops = &module_sysfs_ops, }; /* * param_sysfs_init - wrapper for built-in params support */ static int __init param_sysfs_init(void) { module_kset = kset_create_and_add("module", &module_uevent_ops, NULL); if (!module_kset) { printk(KERN_WARNING "%s (%d): error creating kset\n", __FILE__, __LINE__); return -ENOMEM; } module_sysfs_initialized = 1; version_sysfs_builtin(); param_sysfs_builtin(); return 0; } subsys_initcall(param_sysfs_init); #endif /* CONFIG_SYSFS */ |
46 376 337 2714 2714 2713 45 45 45 45 45 45 45 46 38 38 38 20 38 38 38 38 38 38 38 38 45 4867 4869 4867 4868 78 78 74 74 74 68 68 68 68 174 10 2570 2573 2573 55 38 99 2568 10 2567 99 96 2542 18 250 2432 2432 147 1 67 8 2420 2407 2420 43 73 539 533 68 533 217 216 207 217 120 120 120 102 398 397 130 130 123 235 357 234 124 46 24 46 2115 2056 1194 2115 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/balloc.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/time.h> #include <linux/capability.h> #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include "ext4.h" #include "ext4_jbd2.h" #include "mballoc.h" #include <trace/events/ext4.h> static unsigned ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group); /* * balloc.c contains the blocks allocation and deallocation routines */ /* * Calculate block group number for a given block number */ ext4_group_t ext4_get_group_number(struct super_block *sb, ext4_fsblk_t block) { ext4_group_t group; if (test_opt2(sb, STD_GROUP_SIZE)) group = (block - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >> (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); else ext4_get_group_no_and_offset(sb, block, &group, NULL); return group; } /* * Calculate the block group number and offset into the block/cluster * allocation bitmap, given a block number */ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; ext4_grpblk_t offset; blocknr = blocknr - le32_to_cpu(es->s_first_data_block); offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >> EXT4_SB(sb)->s_cluster_bits; if (offsetp) *offsetp = offset; if (blockgrpp) *blockgrpp = blocknr; } /* * Check whether the 'block' lives within the 'block_group'. Returns 1 if so * and 0 otherwise. */ static inline int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, ext4_group_t block_group) { ext4_group_t actual_group; actual_group = ext4_get_group_number(sb, block); return (actual_group == block_group) ? 1 : 0; } /* Return the number of clusters used for file system metadata; this * represents the overhead needed by the file system. */ static unsigned ext4_num_overhead_clusters(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { unsigned num_clusters; int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c; ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); ext4_fsblk_t itbl_blk; struct ext4_sb_info *sbi = EXT4_SB(sb); /* This is the number of clusters used by the superblock, * block group descriptors, and reserved block group * descriptor blocks */ num_clusters = ext4_num_base_meta_clusters(sb, block_group); /* * For the allocation bitmaps and inode table, we first need * to check to see if the block is in the block group. If it * is, then check to see if the cluster is already accounted * for in the clusters used for the base metadata cluster, or * if we can increment the base metadata cluster to include * that block. Otherwise, we will have to track the cluster * used for the allocation bitmap or inode table explicitly. * Normally all of these blocks are contiguous, so the special * case handling shouldn't be necessary except for *very* * unusual file system layouts. */ if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { block_cluster = EXT4_B2C(sbi, ext4_block_bitmap(sb, gdp) - start); if (block_cluster < num_clusters) block_cluster = -1; else if (block_cluster == num_clusters) { num_clusters++; block_cluster = -1; } } if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { inode_cluster = EXT4_B2C(sbi, ext4_inode_bitmap(sb, gdp) - start); if (inode_cluster < num_clusters) inode_cluster = -1; else if (inode_cluster == num_clusters) { num_clusters++; inode_cluster = -1; } } itbl_blk = ext4_inode_table(sb, gdp); for (i = 0; i < sbi->s_itb_per_group; i++) { if (ext4_block_in_group(sb, itbl_blk + i, block_group)) { c = EXT4_B2C(sbi, itbl_blk + i - start); if ((c < num_clusters) || (c == inode_cluster) || (c == block_cluster) || (c == itbl_cluster)) continue; if (c == num_clusters) { num_clusters++; continue; } num_clusters++; itbl_cluster = c; } } if (block_cluster != -1) num_clusters++; if (inode_cluster != -1) num_clusters++; return num_clusters; } static unsigned int num_clusters_in_group(struct super_block *sb, ext4_group_t block_group) { unsigned int blocks; if (block_group == ext4_get_groups_count(sb) - 1) { /* * Even though mke2fs always initializes the first and * last group, just in case some other tool was used, * we need to make sure we calculate the right free * blocks. */ blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) - ext4_group_first_block_no(sb, block_group); } else blocks = EXT4_BLOCKS_PER_GROUP(sb); return EXT4_NUM_B2C(EXT4_SB(sb), blocks); } /* Initializes an uninitialized block bitmap */ static int ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) { unsigned int bit, bit_max; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start, tmp; J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT | EXT4_GROUP_INFO_IBITMAP_CORRUPT); return -EFSBADCRC; } memset(bh->b_data, 0, sb->s_blocksize); bit_max = ext4_num_base_meta_clusters(sb, block_group); if ((bit_max >> 3) >= bh->b_size) return -EFSCORRUPTED; for (bit = 0; bit < bit_max; bit++) ext4_set_bit(bit, bh->b_data); start = ext4_group_first_block_no(sb, block_group); /* Set bits for block and inode bitmaps, and inode table */ tmp = ext4_block_bitmap(sb, gdp); if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_bitmap(sb, gdp); if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_table(sb, gdp); for (; tmp < ext4_inode_table(sb, gdp) + sbi->s_itb_per_group; tmp++) { if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); } /* * Also if the number of blocks within the group is less than * the blocksize * 8 ( which is the size of bitmap ), set rest * of the block bitmap to 1 */ ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), sb->s_blocksize * 8, bh->b_data); return 0; } /* Return the number of free blocks in a block group. It is used when * the block bitmap is uninitialized, so we can't just count the bits * in the bitmap. */ unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { return num_clusters_in_group(sb, block_group) - ext4_num_overhead_clusters(sb, block_group, gdp); } /* * The free blocks are managed by bitmaps. A file system contains several * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap * block for inodes, N blocks for the inode table and data blocks. * * The file system contains group descriptors which are located after the * super block. Each descriptor contains the number of the bitmap block and * the free blocks count in the block. The descriptors are loaded in memory * when a file system is mounted (see ext4_fill_super). */ /** * ext4_get_group_desc() -- load group descriptor from disk * @sb: super block * @block_group: given block group * @bh: pointer to the buffer head to store the block * group descriptor */ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, ext4_group_t block_group, struct buffer_head **bh) { unsigned int group_desc; unsigned int offset; ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh_p; if (block_group >= ngroups) { ext4_error(sb, "block_group >= groups_count - block_group = %u," " groups_count = %u", block_group, ngroups); return NULL; } group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); bh_p = sbi_array_rcu_deref(sbi, s_group_desc, group_desc); /* * sbi_array_rcu_deref returns with rcu unlocked, this is ok since * the pointer being dereferenced won't be dereferenced again. By * looking at the usage in add_new_gdb() the value isn't modified, * just the pointer, and so it remains valid. */ if (!bh_p) { ext4_error(sb, "Group descriptor not loaded - " "block_group = %u, group_desc = %u, desc = %u", block_group, group_desc, offset); return NULL; } desc = (struct ext4_group_desc *)( (__u8 *)bh_p->b_data + offset * EXT4_DESC_SIZE(sb)); if (bh) *bh = bh_p; return desc; } /* * Return the block number which was discovered to be invalid, or 0 if * the block bitmap is valid. */ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, ext4_group_t block_group, struct buffer_head *bh) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t offset; ext4_grpblk_t next_zero_bit; ext4_grpblk_t max_bit = EXT4_CLUSTERS_PER_GROUP(sb); ext4_fsblk_t blk; ext4_fsblk_t group_first_block; if (ext4_has_feature_flex_bg(sb)) { /* with FLEX_BG, the inode/block bitmaps and itable * blocks may not be in the group at all * so the bitmap validation will be skipped for those groups * or it has to also read the block group where the bitmaps * are located to verify they are set. */ return 0; } group_first_block = ext4_group_first_block_no(sb, block_group); /* check whether block bitmap block number is set */ blk = ext4_block_bitmap(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; /* check whether the inode bitmap block number is set */ blk = ext4_inode_bitmap(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; /* check whether the inode table block number is set */ blk = ext4_inode_table(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || EXT4_B2C(sbi, offset + sbi->s_itb_per_group) >= max_bit) return blk; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, EXT4_B2C(sbi, offset + sbi->s_itb_per_group), EXT4_B2C(sbi, offset)); if (next_zero_bit < EXT4_B2C(sbi, offset + sbi->s_itb_per_group)) /* bad bitmap for inode tables */ return blk; return 0; } static int ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, ext4_group_t block_group, struct buffer_head *bh) { ext4_fsblk_t blk; struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); if (buffer_verified(bh)) return 0; if (EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) return -EFSCORRUPTED; ext4_lock_group(sb, block_group); if (buffer_verified(bh)) goto verified; if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, desc, bh))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSBADCRC; } blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); if (unlikely(blk != 0)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: invalid block bitmap", block_group, blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSCORRUPTED; } set_buffer_verified(bh); verified: ext4_unlock_group(sb, block_group); return 0; } /** * ext4_read_block_bitmap_nowait() * @sb: super block * @block_group: given block group * * Read the bitmap for a given block_group,and validate the * bits for block/inode/inode tables are set in the bitmaps * * Return buffer_head on success or NULL in case of failure. */ struct buffer_head * ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) { struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; ext4_fsblk_t bitmap_blk; int err; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return ERR_PTR(-EFSCORRUPTED); bitmap_blk = ext4_block_bitmap(sb, desc); if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { ext4_error(sb, "Invalid block bitmap block %llu in " "block_group %u", bitmap_blk, block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return ERR_PTR(-EFSCORRUPTED); } bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { ext4_warning(sb, "Cannot get buffer for block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); return ERR_PTR(-ENOMEM); } if (bitmap_uptodate(bh)) goto verify; lock_buffer(bh); if (bitmap_uptodate(bh)) { unlock_buffer(bh); goto verify; } ext4_lock_group(sb, block_group); if (ext4_has_group_desc_csum(sb) && (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { if (block_group == 0) { ext4_unlock_group(sb, block_group); unlock_buffer(bh); ext4_error(sb, "Block bitmap for bg 0 marked " "uninitialized"); err = -EFSCORRUPTED; goto out; } err = ext4_init_block_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); if (err) { ext4_error(sb, "Failed to init block bitmap for group " "%u: %d", block_group, err); goto out; } goto verify; } ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, * bitmap is also uptodate */ set_bitmap_uptodate(bh); unlock_buffer(bh); goto verify; } /* * submit the buffer_head for reading */ set_buffer_new(bh); trace_ext4_read_block_bitmap_load(sb, block_group); bh->b_end_io = ext4_end_bitmap_read; get_bh(bh); submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); return bh; verify: err = ext4_validate_block_bitmap(sb, desc, block_group, bh); if (err) goto out; return bh; out: put_bh(bh); return ERR_PTR(err); } /* Returns 0 on success, 1 on error */ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh) { struct ext4_group_desc *desc; if (!buffer_new(bh)) return 0; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return -EFSCORRUPTED; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ext4_error(sb, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, (unsigned long long) bh->b_blocknr); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EIO; } clear_buffer_new(bh); /* Panic or remount fs read-only if block bitmap is invalid */ return ext4_validate_block_bitmap(sb, desc, block_group, bh); } struct buffer_head * ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) { struct buffer_head *bh; int err; bh = ext4_read_block_bitmap_nowait(sb, block_group); if (IS_ERR(bh)) return bh; err = ext4_wait_block_bitmap(sb, block_group, bh); if (err) { put_bh(bh); return ERR_PTR(err); } return bh; } /** * ext4_has_free_clusters() * @sbi: in-core super block structure. * @nclusters: number of needed blocks * @flags: flags from ext4_mb_new_blocks() * * Check if filesystem has nclusters free & available for allocation. * On success return 1, return 0 on failure. */ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { s64 free_clusters, dirty_clusters, rsv, resv_clusters; struct percpu_counter *fcc = &sbi->s_freeclusters_counter; struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; free_clusters = percpu_counter_read_positive(fcc); dirty_clusters = percpu_counter_read_positive(dcc); resv_clusters = atomic64_read(&sbi->s_resv_clusters); /* * r_blocks_count should always be multiple of the cluster ratio so * we are safe to do a plane bit shift only. */ rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) + resv_clusters; if (free_clusters - (nclusters + rsv + dirty_clusters) < EXT4_FREECLUSTERS_WATERMARK) { free_clusters = percpu_counter_sum_positive(fcc); dirty_clusters = percpu_counter_sum_positive(dcc); } /* Check whether we have space after accounting for current * dirty clusters & root reserved clusters. */ if (free_clusters >= (rsv + nclusters + dirty_clusters)) return 1; /* Hm, nope. Are (enough) root reserved clusters available? */ if (uid_eq(sbi->s_resuid, current_fsuid()) || (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || capable(CAP_SYS_RESOURCE) || (flags & EXT4_MB_USE_ROOT_BLOCKS)) { if (free_clusters >= (nclusters + dirty_clusters + resv_clusters)) return 1; } /* No free blocks. Let's see if we can dip into reserved pool */ if (flags & EXT4_MB_USE_RESERVED) { if (free_clusters >= (nclusters + dirty_clusters)) return 1; } return 0; } int ext4_claim_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { if (ext4_has_free_clusters(sbi, nclusters, flags)) { percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters); return 0; } else return -ENOSPC; } /** * ext4_should_retry_alloc() * @sb: super block * @retries number of attemps has been made * * ext4_should_retry_alloc() is called when ENOSPC is returned, and if * it is profitable to retry the operation, this function will wait * for the current or committing transaction to complete, and then * return TRUE. We will only retry once. */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) || (*retries)++ > 1 || !EXT4_SB(sb)->s_journal) return 0; smp_mb(); if (EXT4_SB(sb)->s_mb_free_pending == 0) return 0; jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); return 1; } /* * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks * * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) * @count: pointer to total number of clusters needed * @errp: error code * * Return 1st allocated block number on success, *count stores total account * error stores in errp pointer */ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned int flags, unsigned long *count, int *errp) { struct ext4_allocation_request ar; ext4_fsblk_t ret; memset(&ar, 0, sizeof(ar)); /* Fill with neighbour allocated blocks */ ar.inode = inode; ar.goal = goal; ar.len = count ? *count : 1; ar.flags = flags; ret = ext4_mb_new_blocks(handle, &ar, errp); if (count) *count = ar.len; /* * Account for the allocated meta blocks. We will never * fail EDQUOT for metdata, but we do account for it. */ if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { dquot_alloc_block_nofail(inode, EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); } return ret; } /** * ext4_count_free_clusters() -- count filesystem free clusters * @sb: superblock * * Adds up the number of free clusters from each block group. */ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) { ext4_fsblk_t desc_count; struct ext4_group_desc *gdp; ext4_group_t i; ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_group_info *grp; #ifdef EXT4FS_DEBUG struct ext4_super_block *es; ext4_fsblk_t bitmap_count; unsigned int x; struct buffer_head *bitmap_bh = NULL; es = EXT4_SB(sb)->s_es; desc_count = 0; bitmap_count = 0; gdp = NULL; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; grp = NULL; if (EXT4_SB(sb)->s_group_info) grp = ext4_get_group_info(sb, i); if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) desc_count += ext4_free_group_clusters(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_block_bitmap(sb, i); if (IS_ERR(bitmap_bh)) { bitmap_bh = NULL; continue; } x = ext4_count_free(bitmap_bh->b_data, EXT4_CLUSTERS_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" ", computed = %llu, %llu\n", EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), desc_count, bitmap_count); return bitmap_count; #else desc_count = 0; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; grp = NULL; if (EXT4_SB(sb)->s_group_info) grp = ext4_get_group_info(sb, i); if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) desc_count += ext4_free_group_clusters(sb, gdp); } return desc_count; #endif } static inline int test_root(ext4_group_t a, int b) { while (1) { if (a < b) return 0; if (a == b) return 1; if ((a % b) != 0) return 0; a = a / b; } } /** * ext4_bg_has_super - number of blocks used by the superblock in group * @sb: superblock for filesystem * @group: group number to check * * Return the number of blocks used by the superblock (primary or backup) * in this group. Currently this will be only 0 or 1. */ int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (group == 0) return 1; if (ext4_has_feature_sparse_super2(sb)) { if (group == le32_to_cpu(es->s_backup_bgs[0]) || group == le32_to_cpu(es->s_backup_bgs[1])) return 1; return 0; } if ((group <= 1) || !ext4_has_feature_sparse_super(sb)) return 1; if (!(group & 1)) return 0; if (test_root(group, 3) || (test_root(group, 5)) || test_root(group, 7)) return 1; return 0; } static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, ext4_group_t group) { unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb); ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1; if (group == first || group == first + 1 || group == last) return 1; return 0; } static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, ext4_group_t group) { if (!ext4_bg_has_super(sb, group)) return 0; if (ext4_has_feature_meta_bg(sb)) return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); else return EXT4_SB(sb)->s_gdb_count; } /** * ext4_bg_num_gdb - number of blocks used by the group table in group * @sb: superblock for filesystem * @group: group number to check * * Return the number of blocks used by the group descriptor table * (primary or backup) in this group. In the future there may be a * different number of descriptor blocks in each group. */ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) { unsigned long first_meta_bg = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) return ext4_bg_num_gdb_nometa(sb, group); return ext4_bg_num_gdb_meta(sb,group); } /* * This function returns the number of file system metadata clusters at * the beginning of a block group, including the reserved gdt blocks. */ static unsigned ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned num; /* Check for superblock and gdt backups in this group */ num = ext4_bg_has_super(sb, block_group); if (!ext4_has_feature_meta_bg(sb) || block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * sbi->s_desc_per_block) { if (num) { num += ext4_bg_num_gdb(sb, block_group); num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); } } else { /* For META_BG_BLOCK_GROUPS */ num += ext4_bg_num_gdb(sb, block_group); } return EXT4_NUM_B2C(sbi, num); } /** * ext4_inode_to_goal_block - return a hint for block allocation * @inode: inode for block allocation * * Return the ideal location to start allocating blocks for a * newly created inode. */ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_group_t block_group; ext4_grpblk_t colour; int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); ext4_fsblk_t bg_start; ext4_fsblk_t last_block; block_group = ei->i_block_group; if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { /* * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME * block groups per flexgroup, reserve the first block * group for directories and special files. Regular * files will start at the second block group. This * tends to speed up directory access and improves * fsck times. */ block_group &= ~(flex_size-1); if (S_ISREG(inode->i_mode)) block_group++; } bg_start = ext4_group_first_block_no(inode->i_sb, block_group); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; /* * If we are doing delayed allocation, we don't need take * colour into account. */ if (test_opt(inode->i_sb, DELALLOC)) return bg_start; if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) colour = (current->pid % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); else colour = (current->pid % 16) * ((last_block - bg_start) / 16); return bg_start + colour; } |
1 1 1 6 16 16 11 11 8 11 11 11 11 11 11 3 11 11 11 11 11 11 1 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 | /* * mac80211 ethtool hooks for cfg80211 * * Copied from cfg.c - originally * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2014 Intel Corporation (Author: Johannes Berg) * Copyright (C) 2018 Intel Corporation * * This file is GPLv2 as found in COPYING. */ #include <linux/types.h> #include <net/cfg80211.h> #include "ieee80211_i.h" #include "sta_info.h" #include "driver-ops.h" static int ieee80211_set_ringparam(struct net_device *dev, struct ethtool_ringparam *rp) { struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy); if (rp->rx_mini_pending != 0 || rp->rx_jumbo_pending != 0) return -EINVAL; return drv_set_ringparam(local, rp->tx_pending, rp->rx_pending); } static void ieee80211_get_ringparam(struct net_device *dev, struct ethtool_ringparam *rp) { struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy); memset(rp, 0, sizeof(*rp)); drv_get_ringparam(local, &rp->tx_pending, &rp->tx_max_pending, &rp->rx_pending, &rp->rx_max_pending); } static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = { "rx_packets", "rx_bytes", "rx_duplicates", "rx_fragments", "rx_dropped", "tx_packets", "tx_bytes", "tx_filtered", "tx_retry_failed", "tx_retries", "sta_state", "txrate", "rxrate", "signal", "channel", "noise", "ch_time", "ch_time_busy", "ch_time_ext_busy", "ch_time_rx", "ch_time_tx" }; #define STA_STATS_LEN ARRAY_SIZE(ieee80211_gstrings_sta_stats) static int ieee80211_get_sset_count(struct net_device *dev, int sset) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); int rv = 0; if (sset == ETH_SS_STATS) rv += STA_STATS_LEN; rv += drv_get_et_sset_count(sdata, sset); if (rv == 0) return -EOPNOTSUPP; return rv; } static void ieee80211_get_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_channel *channel; struct sta_info *sta; struct ieee80211_local *local = sdata->local; struct station_info sinfo; struct survey_info survey; int i, q; #define STA_STATS_SURVEY_LEN 7 memset(data, 0, sizeof(u64) * STA_STATS_LEN); #define ADD_STA_STATS(sta) \ do { \ data[i++] += sta->rx_stats.packets; \ data[i++] += sta->rx_stats.bytes; \ data[i++] += sta->rx_stats.num_duplicates; \ data[i++] += sta->rx_stats.fragments; \ data[i++] += sta->rx_stats.dropped; \ \ data[i++] += sinfo.tx_packets; \ data[i++] += sinfo.tx_bytes; \ data[i++] += sta->status_stats.filtered; \ data[i++] += sta->status_stats.retry_failed; \ data[i++] += sta->status_stats.retry_count; \ } while (0) /* For Managed stations, find the single station based on BSSID * and use that. For interface types, iterate through all available * stations and add stats for any station that is assigned to this * network device. */ mutex_lock(&local->sta_mtx); if (sdata->vif.type == NL80211_IFTYPE_STATION) { sta = sta_info_get_bss(sdata, sdata->u.mgd.bssid); if (!(sta && !WARN_ON(sta->sdata->dev != dev))) goto do_survey; memset(&sinfo, 0, sizeof(sinfo)); sta_set_sinfo(sta, &sinfo, false); i = 0; ADD_STA_STATS(sta); data[i++] = sta->sta_state; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) data[i] = 100000ULL * cfg80211_calculate_bitrate(&sinfo.txrate); i++; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) data[i] = 100000ULL * cfg80211_calculate_bitrate(&sinfo.rxrate); i++; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG)) data[i] = (u8)sinfo.signal_avg; i++; } else { list_for_each_entry(sta, &local->sta_list, list) { /* Make sure this station belongs to the proper dev */ if (sta->sdata->dev != dev) continue; memset(&sinfo, 0, sizeof(sinfo)); sta_set_sinfo(sta, &sinfo, false); i = 0; ADD_STA_STATS(sta); } } do_survey: i = STA_STATS_LEN - STA_STATS_SURVEY_LEN; /* Get survey stats for current channel */ survey.filled = 0; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); if (chanctx_conf) channel = chanctx_conf->def.chan; else channel = NULL; rcu_read_unlock(); if (channel) { q = 0; do { survey.filled = 0; if (drv_get_survey(local, q, &survey) != 0) { survey.filled = 0; break; } q++; } while (channel != survey.channel); } if (survey.filled) data[i++] = survey.channel->center_freq; else data[i++] = 0; if (survey.filled & SURVEY_INFO_NOISE_DBM) data[i++] = (u8)survey.noise; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME) data[i++] = survey.time; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_BUSY) data[i++] = survey.time_busy; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_EXT_BUSY) data[i++] = survey.time_ext_busy; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_RX) data[i++] = survey.time_rx; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_TX) data[i++] = survey.time_tx; else data[i++] = -1LL; mutex_unlock(&local->sta_mtx); if (WARN_ON(i != STA_STATS_LEN)) return; drv_get_et_stats(sdata, stats, &(data[STA_STATS_LEN])); } static void ieee80211_get_strings(struct net_device *dev, u32 sset, u8 *data) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); int sz_sta_stats = 0; if (sset == ETH_SS_STATS) { sz_sta_stats = sizeof(ieee80211_gstrings_sta_stats); memcpy(data, ieee80211_gstrings_sta_stats, sz_sta_stats); } drv_get_et_strings(sdata, sset, &(data[sz_sta_stats])); } static int ieee80211_get_regs_len(struct net_device *dev) { return 0; } static void ieee80211_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *data) { struct wireless_dev *wdev = dev->ieee80211_ptr; regs->version = wdev->wiphy->hw_version; regs->len = 0; } const struct ethtool_ops ieee80211_ethtool_ops = { .get_drvinfo = cfg80211_get_drvinfo, .get_regs_len = ieee80211_get_regs_len, .get_regs = ieee80211_get_regs, .get_link = ethtool_op_get_link, .get_ringparam = ieee80211_get_ringparam, .set_ringparam = ieee80211_set_ringparam, .get_strings = ieee80211_get_strings, .get_ethtool_stats = ieee80211_get_stats, .get_sset_count = ieee80211_get_sset_count, }; |
102 53 127 48 51 50 48 74 74 67 68 59 101 127 479 479 479 49 10 10 10 106 106 64 64 64 41 42 118 147 57 3 8 50 8 57 220 222 220 22 22 2 8 53 53 4 4 53 39 41 18 18 26 29 29 29 29 26 26 5 4 20 36 37 36 17 37 44 23 12 12 12 12 12 12 10 10 23 1 8 2 2 2 2 1 1 8 11 11 6 6 11 11 1 1 11 11 6 6 11 11 1 1 11 16 16 15 1 16 14 13 13 14 13 15 16 15 13 198 94 92 92 92 198 214 214 214 214 3 3 3 2 195 195 195 194 214 45 214 35 36 36 35 23 23 23 69 42 2 6 6 32 40 32 40 2 1 40 25 15 39 36 26 26 25 28 8 8 8 23 21 28 25 39 39 39 39 39 15 39 25 51 52 16 36 52 52 200 120 200 81 81 47 81 47 34 39 39 25 11 39 45 170 170 62 22 22 12 52 10 10 12 3021 2555 30 30 19 17 3 15 15 52 3006 2329 2980 170 52 52 52 12 12 40 57 5 57 57 57 34 23 23 3001 3023 3008 292 13 253 13 482 2810 2808 12 41 1 7 412 411 412 14 49 48 49 49 21 6 6 45 28 21 21 20 20 20 20 7 20 20 7 6 14 1 680 679 680 680 680 680 679 679 680 1 1 8 1 1 14 14 14 13 7 4 4 3 3 9 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 | /* * xfrm_policy.c * * Changes: * Mitsuru KANDA @USAGI * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * IPv6 support * Kazunori MIYAZAWA @USAGI * YOSHIFUJI Hideaki * Split up af-specific portion * Derek Atkins <derek@ihtfp.com> Add the post_input processor * */ #include <linux/err.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/cache.h> #include <linux/cpu.h> #include <linux/audit.h> #include <net/dst.h> #include <net/flow.h> #include <net/xfrm.h> #include <net/ip.h> #ifdef CONFIG_XFRM_STATISTICS #include <net/snmp.h> #endif #include "xfrm_hash.h" #define XFRM_QUEUE_TMO_MIN ((unsigned)(HZ/10)) #define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ)) #define XFRM_MAX_QUEUE_LEN 100 struct xfrm_flo { struct dst_entry *dst_orig; u8 flags; }; static DEFINE_SPINLOCK(xfrm_if_cb_lock); static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly; static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] __read_mostly; static struct kmem_cache *xfrm_dst_cache __ro_after_init; static __read_mostly seqcount_t xfrm_policy_hash_generation; static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr); static int stale_bundle(struct dst_entry *dst); static int xfrm_bundle_ok(struct xfrm_dst *xdst); static void xfrm_policy_queue_process(struct timer_list *t); static void __xfrm_policy_link(struct xfrm_policy *pol, int dir); static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir); static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy) { return refcount_inc_not_zero(&policy->refcnt); } static inline bool __xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl) { const struct flowi4 *fl4 = &fl->u.ip4; return addr4_match(fl4->daddr, sel->daddr.a4, sel->prefixlen_d) && addr4_match(fl4->saddr, sel->saddr.a4, sel->prefixlen_s) && !((xfrm_flowi_dport(fl, &fl4->uli) ^ sel->dport) & sel->dport_mask) && !((xfrm_flowi_sport(fl, &fl4->uli) ^ sel->sport) & sel->sport_mask) && (fl4->flowi4_proto == sel->proto || !sel->proto) && (fl4->flowi4_oif == sel->ifindex || !sel->ifindex); } static inline bool __xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl) { const struct flowi6 *fl6 = &fl->u.ip6; return addr_match(&fl6->daddr, &sel->daddr, sel->prefixlen_d) && addr_match(&fl6->saddr, &sel->saddr, sel->prefixlen_s) && !((xfrm_flowi_dport(fl, &fl6->uli) ^ sel->dport) & sel->dport_mask) && !((xfrm_flowi_sport(fl, &fl6->uli) ^ sel->sport) & sel->sport_mask) && (fl6->flowi6_proto == sel->proto || !sel->proto) && (fl6->flowi6_oif == sel->ifindex || !sel->ifindex); } bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl, unsigned short family) { switch (family) { case AF_INET: return __xfrm4_selector_match(sel, fl); case AF_INET6: return __xfrm6_selector_match(sel, fl); } return false; } static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) { const struct xfrm_policy_afinfo *afinfo; if (unlikely(family >= ARRAY_SIZE(xfrm_policy_afinfo))) return NULL; rcu_read_lock(); afinfo = rcu_dereference(xfrm_policy_afinfo[family]); if (unlikely(!afinfo)) rcu_read_unlock(); return afinfo; } /* Called with rcu_read_lock(). */ static const struct xfrm_if_cb *xfrm_if_get_cb(void) { return rcu_dereference(xfrm_if_cb); } struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, int family, u32 mark) { const struct xfrm_policy_afinfo *afinfo; struct dst_entry *dst; afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return ERR_PTR(-EAFNOSUPPORT); dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr, mark); rcu_read_unlock(); return dst; } EXPORT_SYMBOL(__xfrm_dst_lookup); static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos, int oif, xfrm_address_t *prev_saddr, xfrm_address_t *prev_daddr, int family, u32 mark) { struct net *net = xs_net(x); xfrm_address_t *saddr = &x->props.saddr; xfrm_address_t *daddr = &x->id.daddr; struct dst_entry *dst; if (x->type->flags & XFRM_TYPE_LOCAL_COADDR) { saddr = x->coaddr; daddr = prev_daddr; } if (x->type->flags & XFRM_TYPE_REMOTE_COADDR) { saddr = prev_saddr; daddr = x->coaddr; } dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family, mark); if (!IS_ERR(dst)) { if (prev_saddr != saddr) memcpy(prev_saddr, saddr, sizeof(*prev_saddr)); if (prev_daddr != daddr) memcpy(prev_daddr, daddr, sizeof(*prev_daddr)); } return dst; } static inline unsigned long make_jiffies(long secs) { if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ) return MAX_SCHEDULE_TIMEOUT-1; else return secs*HZ; } static void xfrm_policy_timer(struct timer_list *t) { struct xfrm_policy *xp = from_timer(xp, t, timer); time64_t now = ktime_get_real_seconds(); time64_t next = TIME64_MAX; int warn = 0; int dir; read_lock(&xp->lock); if (unlikely(xp->walk.dead)) goto out; dir = xfrm_policy_id2dir(xp->index); if (xp->lft.hard_add_expires_seconds) { time64_t tmo = xp->lft.hard_add_expires_seconds + xp->curlft.add_time - now; if (tmo <= 0) goto expired; if (tmo < next) next = tmo; } if (xp->lft.hard_use_expires_seconds) { time64_t tmo = xp->lft.hard_use_expires_seconds + (xp->curlft.use_time ? : xp->curlft.add_time) - now; if (tmo <= 0) goto expired; if (tmo < next) next = tmo; } if (xp->lft.soft_add_expires_seconds) { time64_t tmo = xp->lft.soft_add_expires_seconds + xp->curlft.add_time - now; if (tmo <= 0) { warn = 1; tmo = XFRM_KM_TIMEOUT; } if (tmo < next) next = tmo; } if (xp->lft.soft_use_expires_seconds) { time64_t tmo = xp->lft.soft_use_expires_seconds + (xp->curlft.use_time ? : xp->curlft.add_time) - now; if (tmo <= 0) { warn = 1; tmo = XFRM_KM_TIMEOUT; } if (tmo < next) next = tmo; } if (warn) km_policy_expired(xp, dir, 0, 0); if (next != TIME64_MAX && !mod_timer(&xp->timer, jiffies + make_jiffies(next))) xfrm_pol_hold(xp); out: read_unlock(&xp->lock); xfrm_pol_put(xp); return; expired: read_unlock(&xp->lock); if (!xfrm_policy_delete(xp, dir)) km_policy_expired(xp, dir, 1, 0); xfrm_pol_put(xp); } /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2 * SPD calls. */ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) { struct xfrm_policy *policy; policy = kzalloc(sizeof(struct xfrm_policy), gfp); if (policy) { write_pnet(&policy->xp_net, net); INIT_LIST_HEAD(&policy->walk.all); INIT_HLIST_NODE(&policy->bydst); INIT_HLIST_NODE(&policy->byidx); rwlock_init(&policy->lock); refcount_set(&policy->refcnt, 1); skb_queue_head_init(&policy->polq.hold_queue); timer_setup(&policy->timer, xfrm_policy_timer, 0); timer_setup(&policy->polq.hold_timer, xfrm_policy_queue_process, 0); } return policy; } EXPORT_SYMBOL(xfrm_policy_alloc); static void xfrm_policy_destroy_rcu(struct rcu_head *head) { struct xfrm_policy *policy = container_of(head, struct xfrm_policy, rcu); security_xfrm_policy_free(policy->security); kfree(policy); } /* Destroy xfrm_policy: descendant resources must be released to this moment. */ void xfrm_policy_destroy(struct xfrm_policy *policy) { BUG_ON(!policy->walk.dead); if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer)) BUG(); call_rcu(&policy->rcu, xfrm_policy_destroy_rcu); } EXPORT_SYMBOL(xfrm_policy_destroy); /* Rule must be locked. Release descendant resources, announce * entry dead. The rule must be unlinked from lists to the moment. */ static void xfrm_policy_kill(struct xfrm_policy *policy) { write_lock_bh(&policy->lock); policy->walk.dead = 1; write_unlock_bh(&policy->lock); atomic_inc(&policy->genid); if (del_timer(&policy->polq.hold_timer)) xfrm_pol_put(policy); skb_queue_purge(&policy->polq.hold_queue); if (del_timer(&policy->timer)) xfrm_pol_put(policy); xfrm_pol_put(policy); } static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024; static inline unsigned int idx_hash(struct net *net, u32 index) { return __idx_hash(index, net->xfrm.policy_idx_hmask); } /* calculate policy hash thresholds */ static void __get_hash_thresh(struct net *net, unsigned short family, int dir, u8 *dbits, u8 *sbits) { switch (family) { case AF_INET: *dbits = net->xfrm.policy_bydst[dir].dbits4; *sbits = net->xfrm.policy_bydst[dir].sbits4; break; case AF_INET6: *dbits = net->xfrm.policy_bydst[dir].dbits6; *sbits = net->xfrm.policy_bydst[dir].sbits6; break; default: *dbits = 0; *sbits = 0; } } static struct hlist_head *policy_hash_bysel(struct net *net, const struct xfrm_selector *sel, unsigned short family, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; unsigned int hash; u8 dbits; u8 sbits; __get_hash_thresh(net, family, dir, &dbits, &sbits); hash = __sel_hash(sel, family, hmask, dbits, sbits); if (hash == hmask + 1) return &net->xfrm.policy_inexact[dir]; return rcu_dereference_check(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash; } static struct hlist_head *policy_hash_direct(struct net *net, const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; unsigned int hash; u8 dbits; u8 sbits; __get_hash_thresh(net, family, dir, &dbits, &sbits); hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits); return rcu_dereference_check(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash; } static void xfrm_dst_hash_transfer(struct net *net, struct hlist_head *list, struct hlist_head *ndsttable, unsigned int nhashmask, int dir) { struct hlist_node *tmp, *entry0 = NULL; struct xfrm_policy *pol; unsigned int h0 = 0; u8 dbits; u8 sbits; redo: hlist_for_each_entry_safe(pol, tmp, list, bydst) { unsigned int h; __get_hash_thresh(net, pol->family, dir, &dbits, &sbits); h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr, pol->family, nhashmask, dbits, sbits); if (!entry0) { hlist_del_rcu(&pol->bydst); hlist_add_head_rcu(&pol->bydst, ndsttable + h); h0 = h; } else { if (h != h0) continue; hlist_del_rcu(&pol->bydst); hlist_add_behind_rcu(&pol->bydst, entry0); } entry0 = &pol->bydst; } if (!hlist_empty(list)) { entry0 = NULL; goto redo; } } static void xfrm_idx_hash_transfer(struct hlist_head *list, struct hlist_head *nidxtable, unsigned int nhashmask) { struct hlist_node *tmp; struct xfrm_policy *pol; hlist_for_each_entry_safe(pol, tmp, list, byidx) { unsigned int h; h = __idx_hash(pol->index, nhashmask); hlist_add_head(&pol->byidx, nidxtable+h); } } static unsigned long xfrm_new_hash_mask(unsigned int old_hmask) { return ((old_hmask + 1) << 1) - 1; } static void xfrm_bydst_resize(struct net *net, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; unsigned int nhashmask = xfrm_new_hash_mask(hmask); unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head); struct hlist_head *ndst = xfrm_hash_alloc(nsize); struct hlist_head *odst; int i; if (!ndst) return; spin_lock_bh(&net->xfrm.xfrm_policy_lock); write_seqcount_begin(&xfrm_policy_hash_generation); odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)); odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)); for (i = hmask; i >= 0; i--) xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir); rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst); net->xfrm.policy_bydst[dir].hmask = nhashmask; write_seqcount_end(&xfrm_policy_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); synchronize_rcu(); xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head)); } static void xfrm_byidx_resize(struct net *net, int total) { unsigned int hmask = net->xfrm.policy_idx_hmask; unsigned int nhashmask = xfrm_new_hash_mask(hmask); unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head); struct hlist_head *oidx = net->xfrm.policy_byidx; struct hlist_head *nidx = xfrm_hash_alloc(nsize); int i; if (!nidx) return; spin_lock_bh(&net->xfrm.xfrm_policy_lock); for (i = hmask; i >= 0; i--) xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask); net->xfrm.policy_byidx = nidx; net->xfrm.policy_idx_hmask = nhashmask; spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head)); } static inline int xfrm_bydst_should_resize(struct net *net, int dir, int *total) { unsigned int cnt = net->xfrm.policy_count[dir]; unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; if (total) *total += cnt; if ((hmask + 1) < xfrm_policy_hashmax && cnt > hmask) return 1; return 0; } static inline int xfrm_byidx_should_resize(struct net *net, int total) { unsigned int hmask = net->xfrm.policy_idx_hmask; if ((hmask + 1) < xfrm_policy_hashmax && total > hmask) return 1; return 0; } void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si) { si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN]; si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT]; si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD]; si->inscnt = net->xfrm.policy_count[XFRM_POLICY_IN+XFRM_POLICY_MAX]; si->outscnt = net->xfrm.policy_count[XFRM_POLICY_OUT+XFRM_POLICY_MAX]; si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX]; si->spdhcnt = net->xfrm.policy_idx_hmask; si->spdhmcnt = xfrm_policy_hashmax; } EXPORT_SYMBOL(xfrm_spd_getinfo); static DEFINE_MUTEX(hash_resize_mutex); static void xfrm_hash_resize(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.policy_hash_work); int dir, total; mutex_lock(&hash_resize_mutex); total = 0; for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { if (xfrm_bydst_should_resize(net, dir, &total)) xfrm_bydst_resize(net, dir); } if (xfrm_byidx_should_resize(net, total)) xfrm_byidx_resize(net, total); mutex_unlock(&hash_resize_mutex); } static void xfrm_hash_rebuild(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.policy_hthresh.work); unsigned int hmask; struct xfrm_policy *pol; struct xfrm_policy *policy; struct hlist_head *chain; struct hlist_head *odst; struct hlist_node *newpos; int i; int dir; unsigned seq; u8 lbits4, rbits4, lbits6, rbits6; mutex_lock(&hash_resize_mutex); /* read selector prefixlen thresholds */ do { seq = read_seqbegin(&net->xfrm.policy_hthresh.lock); lbits4 = net->xfrm.policy_hthresh.lbits4; rbits4 = net->xfrm.policy_hthresh.rbits4; lbits6 = net->xfrm.policy_hthresh.lbits6; rbits6 = net->xfrm.policy_hthresh.rbits6; } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq)); spin_lock_bh(&net->xfrm.xfrm_policy_lock); /* reset the bydst and inexact table in all directions */ for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]); hmask = net->xfrm.policy_bydst[dir].hmask; odst = net->xfrm.policy_bydst[dir].table; for (i = hmask; i >= 0; i--) INIT_HLIST_HEAD(odst + i); if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { /* dir out => dst = remote, src = local */ net->xfrm.policy_bydst[dir].dbits4 = rbits4; net->xfrm.policy_bydst[dir].sbits4 = lbits4; net->xfrm.policy_bydst[dir].dbits6 = rbits6; net->xfrm.policy_bydst[dir].sbits6 = lbits6; } else { /* dir in/fwd => dst = local, src = remote */ net->xfrm.policy_bydst[dir].dbits4 = lbits4; net->xfrm.policy_bydst[dir].sbits4 = rbits4; net->xfrm.policy_bydst[dir].dbits6 = lbits6; net->xfrm.policy_bydst[dir].sbits6 = rbits6; } } /* re-insert all policies by order of creation */ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { if (policy->walk.dead || xfrm_policy_id2dir(policy->index) >= XFRM_POLICY_MAX) { /* skip socket policies */ continue; } newpos = NULL; chain = policy_hash_bysel(net, &policy->selector, policy->family, xfrm_policy_id2dir(policy->index)); hlist_for_each_entry(pol, chain, bydst) { if (policy->priority >= pol->priority) newpos = &pol->bydst; else break; } if (newpos) hlist_add_behind_rcu(&policy->bydst, newpos); else hlist_add_head_rcu(&policy->bydst, chain); } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); mutex_unlock(&hash_resize_mutex); } void xfrm_policy_hash_rebuild(struct net *net) { schedule_work(&net->xfrm.policy_hthresh.work); } EXPORT_SYMBOL(xfrm_policy_hash_rebuild); /* Generate new index... KAME seems to generate them ordered by cost * of an absolute inpredictability of ordering of rules. This will not pass. */ static u32 xfrm_gen_index(struct net *net, int dir, u32 index) { static u32 idx_generator; for (;;) { struct hlist_head *list; struct xfrm_policy *p; u32 idx; int found; if (!index) { idx = (idx_generator | dir); idx_generator += 8; } else { idx = index; index = 0; } if (idx == 0) idx = 8; list = net->xfrm.policy_byidx + idx_hash(net, idx); found = 0; hlist_for_each_entry(p, list, byidx) { if (p->index == idx) { found = 1; break; } } if (!found) return idx; } } static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s2) { u32 *p1 = (u32 *) s1; u32 *p2 = (u32 *) s2; int len = sizeof(struct xfrm_selector) / sizeof(u32); int i; for (i = 0; i < len; i++) { if (p1[i] != p2[i]) return 1; } return 0; } static void xfrm_policy_requeue(struct xfrm_policy *old, struct xfrm_policy *new) { struct xfrm_policy_queue *pq = &old->polq; struct sk_buff_head list; if (skb_queue_empty(&pq->hold_queue)) return; __skb_queue_head_init(&list); spin_lock_bh(&pq->hold_queue.lock); skb_queue_splice_init(&pq->hold_queue, &list); if (del_timer(&pq->hold_timer)) xfrm_pol_put(old); spin_unlock_bh(&pq->hold_queue.lock); pq = &new->polq; spin_lock_bh(&pq->hold_queue.lock); skb_queue_splice(&list, &pq->hold_queue); pq->timeout = XFRM_QUEUE_TMO_MIN; if (!mod_timer(&pq->hold_timer, jiffies)) xfrm_pol_hold(new); spin_unlock_bh(&pq->hold_queue.lock); } static inline bool xfrm_policy_mark_match(const struct xfrm_mark *mark, struct xfrm_policy *pol) { return mark->v == pol->mark.v && mark->m == pol->mark.m; } int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) { struct net *net = xp_net(policy); struct xfrm_policy *pol; struct xfrm_policy *delpol; struct hlist_head *chain; struct hlist_node *newpos; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); delpol = NULL; newpos = NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == policy->type && pol->if_id == policy->if_id && !selector_cmp(&pol->selector, &policy->selector) && xfrm_policy_mark_match(&policy->mark, pol) && xfrm_sec_ctx_match(pol->security, policy->security) && !WARN_ON(delpol)) { if (excl) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return -EEXIST; } delpol = pol; if (policy->priority > pol->priority) continue; } else if (policy->priority >= pol->priority) { newpos = &pol->bydst; continue; } if (delpol) break; } if (newpos) hlist_add_behind_rcu(&policy->bydst, newpos); else hlist_add_head_rcu(&policy->bydst, chain); __xfrm_policy_link(policy, dir); /* After previous checking, family can either be AF_INET or AF_INET6 */ if (policy->family == AF_INET) rt_genid_bump_ipv4(net); else rt_genid_bump_ipv6(net); if (delpol) { xfrm_policy_requeue(delpol, policy); __xfrm_policy_unlink(delpol, dir); } policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir, policy->index); hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index)); policy->curlft.add_time = ktime_get_real_seconds(); policy->curlft.use_time = 0; if (!mod_timer(&policy->timer, jiffies + HZ)) xfrm_pol_hold(policy); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (delpol) xfrm_policy_kill(delpol); else if (xfrm_bydst_should_resize(net, dir, NULL)) schedule_work(&net->xfrm.policy_hash_work); return 0; } EXPORT_SYMBOL(xfrm_policy_insert); struct xfrm_policy * xfrm_policy_bysel_ctx(struct net *net, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx, int delete, int *err) { struct xfrm_policy *pol, *ret; struct hlist_head *chain; *err = 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, sel, sel->family, dir); ret = NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == type && pol->if_id == if_id && xfrm_policy_mark_match(mark, pol) && !selector_cmp(sel, &pol->selector) && xfrm_sec_ctx_match(ctx, pol->security)) { xfrm_pol_hold(pol); if (delete) { *err = security_xfrm_policy_delete( pol->security); if (*err) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); } ret = pol; break; } } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); return ret; } EXPORT_SYMBOL(xfrm_policy_bysel_ctx); struct xfrm_policy * xfrm_policy_byid(struct net *net, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, u32 id, int delete, int *err) { struct xfrm_policy *pol, *ret; struct hlist_head *chain; *err = -ENOENT; if (xfrm_policy_id2dir(id) != dir) return NULL; *err = 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = net->xfrm.policy_byidx + idx_hash(net, id); ret = NULL; hlist_for_each_entry(pol, chain, byidx) { if (pol->type == type && pol->index == id && pol->if_id == if_id && xfrm_policy_mark_match(mark, pol)) { xfrm_pol_hold(pol); if (delete) { *err = security_xfrm_policy_delete( pol->security); if (*err) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); } ret = pol; break; } } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); return ret; } EXPORT_SYMBOL(xfrm_policy_byid); #ifdef CONFIG_SECURITY_NETWORK_XFRM static inline int xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { int dir, err = 0; for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy *pol; int i; hlist_for_each_entry(pol, &net->xfrm.policy_inexact[dir], bydst) { if (pol->type != type) continue; err = security_xfrm_policy_delete(pol->security); if (err) { xfrm_audit_policy_delete(pol, 0, task_valid); return err; } } for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) { hlist_for_each_entry(pol, net->xfrm.policy_bydst[dir].table + i, bydst) { if (pol->type != type) continue; err = security_xfrm_policy_delete( pol->security); if (err) { xfrm_audit_policy_delete(pol, 0, task_valid); return err; } } } } return err; } #else static inline int xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { return 0; } #endif int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) { int dir, err = 0, cnt = 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); err = xfrm_policy_flush_secctx_check(net, type, task_valid); if (err) goto out; for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy *pol; int i; again1: hlist_for_each_entry(pol, &net->xfrm.policy_inexact[dir], bydst) { if (pol->type != type) continue; __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); spin_lock_bh(&net->xfrm.xfrm_policy_lock); goto again1; } for (i = net->xfrm.policy_bydst[dir].hmask; i >= 0; i--) { again2: hlist_for_each_entry(pol, net->xfrm.policy_bydst[dir].table + i, bydst) { if (pol->type != type) continue; __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); spin_lock_bh(&net->xfrm.xfrm_policy_lock); goto again2; } } } if (!cnt) err = -ESRCH; out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return err; } EXPORT_SYMBOL(xfrm_policy_flush); int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, int (*func)(struct xfrm_policy *, int, int, void*), void *data) { struct xfrm_policy *pol; struct xfrm_policy_walk_entry *x; int error = 0; if (walk->type >= XFRM_POLICY_TYPE_MAX && walk->type != XFRM_POLICY_TYPE_ANY) return -EINVAL; if (list_empty(&walk->walk.all) && walk->seq != 0) return 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); if (list_empty(&walk->walk.all)) x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all); else x = list_first_entry(&walk->walk.all, struct xfrm_policy_walk_entry, all); list_for_each_entry_from(x, &net->xfrm.policy_all, all) { if (x->dead) continue; pol = container_of(x, struct xfrm_policy, walk); if (walk->type != XFRM_POLICY_TYPE_ANY && walk->type != pol->type) continue; error = func(pol, xfrm_policy_id2dir(pol->index), walk->seq, data); if (error) { list_move_tail(&walk->walk.all, &x->all); goto out; } walk->seq++; } if (walk->seq == 0) { error = -ENOENT; goto out; } list_del_init(&walk->walk.all); out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return error; } EXPORT_SYMBOL(xfrm_policy_walk); void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type) { INIT_LIST_HEAD(&walk->walk.all); walk->walk.dead = 1; walk->type = type; walk->seq = 0; } EXPORT_SYMBOL(xfrm_policy_walk_init); void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net) { if (list_empty(&walk->walk.all)) return; spin_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */ list_del(&walk->walk.all); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } EXPORT_SYMBOL(xfrm_policy_walk_done); /* * Find policy to apply to this flow. * * Returns 0 if policy found, else an -errno. */ static int xfrm_policy_match(const struct xfrm_policy *pol, const struct flowi *fl, u8 type, u16 family, int dir, u32 if_id) { const struct xfrm_selector *sel = &pol->selector; int ret = -ESRCH; bool match; if (pol->family != family || pol->if_id != if_id || (fl->flowi_mark & pol->mark.m) != pol->mark.v || pol->type != type) return ret; match = xfrm_selector_match(sel, fl, family); if (match) ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid, dir); return ret; } static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, const struct flowi *fl, u16 family, u8 dir, u32 if_id) { int err; struct xfrm_policy *pol, *ret; const xfrm_address_t *daddr, *saddr; struct hlist_head *chain; unsigned int sequence; u32 priority; daddr = xfrm_flowi_daddr(fl, family); saddr = xfrm_flowi_saddr(fl, family); if (unlikely(!daddr || !saddr)) return NULL; rcu_read_lock(); retry: do { sequence = read_seqcount_begin(&xfrm_policy_hash_generation); chain = policy_hash_direct(net, daddr, saddr, family, dir); } while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)); priority = ~0U; ret = NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { err = xfrm_policy_match(pol, fl, type, family, dir, if_id); if (err) { if (err == -ESRCH) continue; else { ret = ERR_PTR(err); goto fail; } } else { ret = pol; priority = ret->priority; break; } } chain = &net->xfrm.policy_inexact[dir]; hlist_for_each_entry_rcu(pol, chain, bydst) { if ((pol->priority >= priority) && ret) break; err = xfrm_policy_match(pol, fl, type, family, dir, if_id); if (err) { if (err == -ESRCH) continue; else { ret = ERR_PTR(err); goto fail; } } else { ret = pol; break; } } if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) goto retry; if (ret && !xfrm_pol_hold_rcu(ret)) goto retry; fail: rcu_read_unlock(); return ret; } static struct xfrm_policy *xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, u32 if_id) { #ifdef CONFIG_XFRM_SUB_POLICY struct xfrm_policy *pol; pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir, if_id); if (pol != NULL) return pol; #endif return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir, if_id); } static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, const struct flowi *fl, u16 family, u32 if_id) { struct xfrm_policy *pol; rcu_read_lock(); again: pol = rcu_dereference(sk->sk_policy[dir]); if (pol != NULL) { bool match; int err = 0; if (pol->family != family) { pol = NULL; goto out; } match = xfrm_selector_match(&pol->selector, fl, family); if (match) { if ((sk->sk_mark & pol->mark.m) != pol->mark.v || pol->if_id != if_id) { pol = NULL; goto out; } err = security_xfrm_policy_lookup(pol->security, fl->flowi_secid, dir); if (!err) { if (!xfrm_pol_hold_rcu(pol)) goto again; } else if (err == -ESRCH) { pol = NULL; } else { pol = ERR_PTR(err); } } else pol = NULL; } out: rcu_read_unlock(); return pol; } static void __xfrm_policy_link(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); list_add(&pol->walk.all, &net->xfrm.policy_all); net->xfrm.policy_count[dir]++; xfrm_pol_hold(pol); } static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); if (list_empty(&pol->walk.all)) return NULL; /* Socket policies are not hashed. */ if (!hlist_unhashed(&pol->bydst)) { hlist_del_rcu(&pol->bydst); hlist_del(&pol->byidx); } list_del_init(&pol->walk.all); net->xfrm.policy_count[dir]--; return pol; } static void xfrm_sk_policy_link(struct xfrm_policy *pol, int dir) { __xfrm_policy_link(pol, XFRM_POLICY_MAX + dir); } static void xfrm_sk_policy_unlink(struct xfrm_policy *pol, int dir) { __xfrm_policy_unlink(pol, XFRM_POLICY_MAX + dir); } int xfrm_policy_delete(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); spin_lock_bh(&net->xfrm.xfrm_policy_lock); pol = __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (pol) { xfrm_policy_kill(pol); return 0; } return -ENOENT; } EXPORT_SYMBOL(xfrm_policy_delete); int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) { struct net *net = sock_net(sk); struct xfrm_policy *old_pol; #ifdef CONFIG_XFRM_SUB_POLICY if (pol && pol->type != XFRM_POLICY_TYPE_MAIN) return -EINVAL; #endif spin_lock_bh(&net->xfrm.xfrm_policy_lock); old_pol = rcu_dereference_protected(sk->sk_policy[dir], lockdep_is_held(&net->xfrm.xfrm_policy_lock)); if (pol) { pol->curlft.add_time = ktime_get_real_seconds(); pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0); xfrm_sk_policy_link(pol, dir); } rcu_assign_pointer(sk->sk_policy[dir], pol); if (old_pol) { if (pol) xfrm_policy_requeue(old_pol, pol); /* Unlinking succeeds always. This is the only function * allowed to delete or replace socket policy. */ xfrm_sk_policy_unlink(old_pol, dir); } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (old_pol) { xfrm_policy_kill(old_pol); } return 0; } static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) { struct xfrm_policy *newp = xfrm_policy_alloc(xp_net(old), GFP_ATOMIC); struct net *net = xp_net(old); if (newp) { newp->selector = old->selector; if (security_xfrm_policy_clone(old->security, &newp->security)) { kfree(newp); return NULL; /* ENOMEM */ } newp->lft = old->lft; newp->curlft = old->curlft; newp->mark = old->mark; newp->if_id = old->if_id; newp->action = old->action; newp->flags = old->flags; newp->xfrm_nr = old->xfrm_nr; newp->index = old->index; newp->type = old->type; newp->family = old->family; memcpy(newp->xfrm_vec, old->xfrm_vec, newp->xfrm_nr*sizeof(struct xfrm_tmpl)); spin_lock_bh(&net->xfrm.xfrm_policy_lock); xfrm_sk_policy_link(newp, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_pol_put(newp); } return newp; } int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { const struct xfrm_policy *p; struct xfrm_policy *np; int i, ret = 0; rcu_read_lock(); for (i = 0; i < 2; i++) { p = rcu_dereference(osk->sk_policy[i]); if (p) { np = clone_policy(p, i); if (unlikely(!np)) { ret = -ENOMEM; break; } rcu_assign_pointer(sk->sk_policy[i], np); } } rcu_read_unlock(); return ret; } static int xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local, xfrm_address_t *remote, unsigned short family, u32 mark) { int err; const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return -EINVAL; err = afinfo->get_saddr(net, oif, local, remote, mark); rcu_read_unlock(); return err; } /* Resolve list of templates for the flow, given policy. */ static int xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, struct xfrm_state **xfrm, unsigned short family) { struct net *net = xp_net(policy); int nx; int i, error; xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family); xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family); xfrm_address_t tmp; for (nx = 0, i = 0; i < policy->xfrm_nr; i++) { struct xfrm_state *x; xfrm_address_t *remote = daddr; xfrm_address_t *local = saddr; struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i]; if (tmpl->mode == XFRM_MODE_TUNNEL || tmpl->mode == XFRM_MODE_BEET) { remote = &tmpl->id.daddr; local = &tmpl->saddr; if (xfrm_addr_any(local, tmpl->encap_family)) { error = xfrm_get_saddr(net, fl->flowi_oif, &tmp, remote, tmpl->encap_family, 0); if (error) goto fail; local = &tmp; } } x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family, policy->if_id); if (x && x->km.state == XFRM_STATE_VALID) { xfrm[nx++] = x; daddr = remote; saddr = local; continue; } if (x) { error = (x->km.state == XFRM_STATE_ERROR ? -EINVAL : -EAGAIN); xfrm_state_put(x); } else if (error == -ESRCH) { error = -EAGAIN; } if (!tmpl->optional) goto fail; } return nx; fail: for (nx--; nx >= 0; nx--) xfrm_state_put(xfrm[nx]); return error; } static int xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl, struct xfrm_state **xfrm, unsigned short family) { struct xfrm_state *tp[XFRM_MAX_DEPTH]; struct xfrm_state **tpp = (npols > 1) ? tp : xfrm; int cnx = 0; int error; int ret; int i; for (i = 0; i < npols; i++) { if (cnx + pols[i]->xfrm_nr >= XFRM_MAX_DEPTH) { error = -ENOBUFS; goto fail; } ret = xfrm_tmpl_resolve_one(pols[i], fl, &tpp[cnx], family); if (ret < 0) { error = ret; goto fail; } else cnx += ret; } /* found states are sorted for outbound processing */ if (npols > 1) xfrm_state_sort(xfrm, tpp, cnx, family); return cnx; fail: for (cnx--; cnx >= 0; cnx--) xfrm_state_put(tpp[cnx]); return error; } static int xfrm_get_tos(const struct flowi *fl, int family) { const struct xfrm_policy_afinfo *afinfo; int tos; afinfo = xfrm_policy_get_afinfo(family); if (!afinfo) return 0; tos = afinfo->get_tos(fl); rcu_read_unlock(); return tos; } static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); struct dst_ops *dst_ops; struct xfrm_dst *xdst; if (!afinfo) return ERR_PTR(-EINVAL); switch (family) { case AF_INET: dst_ops = &net->xfrm.xfrm4_dst_ops; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: dst_ops = &net->xfrm.xfrm6_dst_ops; break; #endif default: BUG(); } xdst = dst_alloc(dst_ops, NULL, 1, DST_OBSOLETE_NONE, 0); if (likely(xdst)) { struct dst_entry *dst = &xdst->u.dst; memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst)); } else xdst = ERR_PTR(-ENOBUFS); rcu_read_unlock(); return xdst; } static inline int xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(dst->ops->family); int err; if (!afinfo) return -EINVAL; err = afinfo->init_path(path, dst, nfheader_len); rcu_read_unlock(); return err; } static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(xdst->u.dst.ops->family); int err; if (!afinfo) return -EINVAL; err = afinfo->fill_dst(xdst, dev, fl); rcu_read_unlock(); return err; } /* Allocate chain of dst_entry's, attach known xfrm's, calculate * all the metrics... Shortly, bundle a bundle. */ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, struct xfrm_dst **bundle, int nx, const struct flowi *fl, struct dst_entry *dst) { struct net *net = xp_net(policy); unsigned long now = jiffies; struct net_device *dev; struct xfrm_mode *inner_mode; struct xfrm_dst *xdst_prev = NULL; struct xfrm_dst *xdst0 = NULL; int i = 0; int err; int header_len = 0; int nfheader_len = 0; int trailer_len = 0; int tos; int family = policy->selector.family; xfrm_address_t saddr, daddr; xfrm_flowi_addr_get(fl, &saddr, &daddr, family); tos = xfrm_get_tos(fl, family); dst_hold(dst); for (; i < nx; i++) { struct xfrm_dst *xdst = xfrm_alloc_dst(net, family); struct dst_entry *dst1 = &xdst->u.dst; err = PTR_ERR(xdst); if (IS_ERR(xdst)) { dst_release(dst); goto put_states; } bundle[i] = xdst; if (!xdst_prev) xdst0 = xdst; else /* Ref count is taken during xfrm_alloc_dst() * No need to do dst_clone() on dst1 */ xfrm_dst_set_child(xdst_prev, &xdst->u.dst); if (xfrm[i]->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(xfrm[i], xfrm_af2proto(family)); if (!inner_mode) { err = -EAFNOSUPPORT; dst_release(dst); goto put_states; } } else inner_mode = xfrm[i]->inner_mode; xdst->route = dst; dst_copy_metrics(dst1, dst); if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { __u32 mark = 0; if (xfrm[i]->props.smark.v || xfrm[i]->props.smark.m) mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]); family = xfrm[i]->props.family; dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif, &saddr, &daddr, family, mark); err = PTR_ERR(dst); if (IS_ERR(dst)) goto put_states; } else dst_hold(dst); dst1->xfrm = xfrm[i]; xdst->xfrm_genid = xfrm[i]->genid; dst1->obsolete = DST_OBSOLETE_FORCE_CHK; dst1->flags |= DST_HOST; dst1->lastuse = now; dst1->input = dst_discard; dst1->output = inner_mode->afinfo->output; xdst_prev = xdst; header_len += xfrm[i]->props.header_len; if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT) nfheader_len += xfrm[i]->props.header_len; trailer_len += xfrm[i]->props.trailer_len; } xfrm_dst_set_child(xdst_prev, dst); xdst0->path = dst; err = -ENODEV; dev = dst->dev; if (!dev) goto free_dst; xfrm_init_path(xdst0, dst, nfheader_len); xfrm_init_pmtu(bundle, nx); for (xdst_prev = xdst0; xdst_prev != (struct xfrm_dst *)dst; xdst_prev = (struct xfrm_dst *) xfrm_dst_child(&xdst_prev->u.dst)) { err = xfrm_fill_dst(xdst_prev, dev, fl); if (err) goto free_dst; xdst_prev->u.dst.header_len = header_len; xdst_prev->u.dst.trailer_len = trailer_len; header_len -= xdst_prev->u.dst.xfrm->props.header_len; trailer_len -= xdst_prev->u.dst.xfrm->props.trailer_len; } return &xdst0->u.dst; put_states: for (; i < nx; i++) xfrm_state_put(xfrm[i]); free_dst: if (xdst0) dst_release_immediate(&xdst0->u.dst); return ERR_PTR(err); } static int xfrm_expand_policies(const struct flowi *fl, u16 family, struct xfrm_policy **pols, int *num_pols, int *num_xfrms) { int i; if (*num_pols == 0 || !pols[0]) { *num_pols = 0; *num_xfrms = 0; return 0; } if (IS_ERR(pols[0])) { *num_pols = 0; return PTR_ERR(pols[0]); } *num_xfrms = pols[0]->xfrm_nr; #ifdef CONFIG_XFRM_SUB_POLICY if (pols[0] && pols[0]->action == XFRM_POLICY_ALLOW && pols[0]->type != XFRM_POLICY_TYPE_MAIN) { pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]), XFRM_POLICY_TYPE_MAIN, fl, family, XFRM_POLICY_OUT, pols[0]->if_id); if (pols[1]) { if (IS_ERR(pols[1])) { xfrm_pols_put(pols, *num_pols); *num_pols = 0; return PTR_ERR(pols[1]); } (*num_pols)++; (*num_xfrms) += pols[1]->xfrm_nr; } } #endif for (i = 0; i < *num_pols; i++) { if (pols[i]->action != XFRM_POLICY_ALLOW) { *num_xfrms = -1; break; } } return 0; } static struct xfrm_dst * xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, const struct flowi *fl, u16 family, struct dst_entry *dst_orig) { struct net *net = xp_net(pols[0]); struct xfrm_state *xfrm[XFRM_MAX_DEPTH]; struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; struct xfrm_dst *xdst; struct dst_entry *dst; int err; /* Try to instantiate a bundle */ err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family); if (err <= 0) { if (err == 0) return NULL; if (err != -EAGAIN) XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); return ERR_PTR(err); } dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig); if (IS_ERR(dst)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR); return ERR_CAST(dst); } xdst = (struct xfrm_dst *)dst; xdst->num_xfrms = err; xdst->num_pols = num_pols; memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); xdst->policy_genid = atomic_read(&pols[0]->genid); return xdst; } static void xfrm_policy_queue_process(struct timer_list *t) { struct sk_buff *skb; struct sock *sk; struct dst_entry *dst; struct xfrm_policy *pol = from_timer(pol, t, polq.hold_timer); struct net *net = xp_net(pol); struct xfrm_policy_queue *pq = &pol->polq; struct flowi fl; struct sk_buff_head list; spin_lock(&pq->hold_queue.lock); skb = skb_peek(&pq->hold_queue); if (!skb) { spin_unlock(&pq->hold_queue.lock); goto out; } dst = skb_dst(skb); sk = skb->sk; xfrm_decode_session(skb, &fl, dst->ops->family); spin_unlock(&pq->hold_queue.lock); dst_hold(xfrm_dst_path(dst)); dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) goto purge_queue; if (dst->flags & DST_XFRM_QUEUE) { dst_release(dst); if (pq->timeout >= XFRM_QUEUE_TMO_MAX) goto purge_queue; pq->timeout = pq->timeout << 1; if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout)) xfrm_pol_hold(pol); goto out; } dst_release(dst); __skb_queue_head_init(&list); spin_lock(&pq->hold_queue.lock); pq->timeout = 0; skb_queue_splice_init(&pq->hold_queue, &list); spin_unlock(&pq->hold_queue.lock); while (!skb_queue_empty(&list)) { skb = __skb_dequeue(&list); xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family); dst_hold(xfrm_dst_path(skb_dst(skb))); dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0); if (IS_ERR(dst)) { kfree_skb(skb); continue; } nf_reset(skb); skb_dst_drop(skb); skb_dst_set(skb, dst); dst_output(net, skb->sk, skb); } out: xfrm_pol_put(pol); return; purge_queue: pq->timeout = 0; skb_queue_purge(&pq->hold_queue); xfrm_pol_put(pol); } static int xdst_queue_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned long sched_next; struct dst_entry *dst = skb_dst(skb); struct xfrm_dst *xdst = (struct xfrm_dst *) dst; struct xfrm_policy *pol = xdst->pols[0]; struct xfrm_policy_queue *pq = &pol->polq; if (unlikely(skb_fclone_busy(sk, skb))) { kfree_skb(skb); return 0; } if (pq->hold_queue.qlen > XFRM_MAX_QUEUE_LEN) { kfree_skb(skb); return -EAGAIN; } skb_dst_force(skb); spin_lock_bh(&pq->hold_queue.lock); if (!pq->timeout) pq->timeout = XFRM_QUEUE_TMO_MIN; sched_next = jiffies + pq->timeout; if (del_timer(&pq->hold_timer)) { if (time_before(pq->hold_timer.expires, sched_next)) sched_next = pq->hold_timer.expires; xfrm_pol_put(pol); } __skb_queue_tail(&pq->hold_queue, skb); if (!mod_timer(&pq->hold_timer, sched_next)) xfrm_pol_hold(pol); spin_unlock_bh(&pq->hold_queue.lock); return 0; } static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net, struct xfrm_flo *xflo, const struct flowi *fl, int num_xfrms, u16 family) { int err; struct net_device *dev; struct dst_entry *dst; struct dst_entry *dst1; struct xfrm_dst *xdst; xdst = xfrm_alloc_dst(net, family); if (IS_ERR(xdst)) return xdst; if (!(xflo->flags & XFRM_LOOKUP_QUEUE) || net->xfrm.sysctl_larval_drop || num_xfrms <= 0) return xdst; dst = xflo->dst_orig; dst1 = &xdst->u.dst; dst_hold(dst); xdst->route = dst; dst_copy_metrics(dst1, dst); dst1->obsolete = DST_OBSOLETE_FORCE_CHK; dst1->flags |= DST_HOST | DST_XFRM_QUEUE; dst1->lastuse = jiffies; dst1->input = dst_discard; dst1->output = xdst_queue_output; dst_hold(dst); xfrm_dst_set_child(xdst, dst); xdst->path = dst; xfrm_init_path((struct xfrm_dst *)dst1, dst, 0); err = -ENODEV; dev = dst->dev; if (!dev) goto free_dst; err = xfrm_fill_dst(xdst, dev, fl); if (err) goto free_dst; out: return xdst; free_dst: dst_release(dst1); xdst = ERR_PTR(err); goto out; } static struct xfrm_dst *xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, struct xfrm_flo *xflo, u32 if_id) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols = 0, num_xfrms = 0, err; struct xfrm_dst *xdst; /* Resolve policies to use if we couldn't get them from * previous cache entry */ num_pols = 1; pols[0] = xfrm_policy_lookup(net, fl, family, dir, if_id); err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); if (err < 0) goto inc_error; if (num_pols == 0) return NULL; if (num_xfrms <= 0) goto make_dummy_bundle; xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, xflo->dst_orig); if (IS_ERR(xdst)) { err = PTR_ERR(xdst); if (err == -EREMOTE) { xfrm_pols_put(pols, num_pols); return NULL; } if (err != -EAGAIN) goto error; goto make_dummy_bundle; } else if (xdst == NULL) { num_xfrms = 0; goto make_dummy_bundle; } return xdst; make_dummy_bundle: /* We found policies, but there's no bundles to instantiate: * either because the policy blocks, has no transformations or * we could not build template (no xfrm_states).*/ xdst = xfrm_create_dummy_bundle(net, xflo, fl, num_xfrms, family); if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); return ERR_CAST(xdst); } xdst->num_pols = num_pols; xdst->num_xfrms = num_xfrms; memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); return xdst; inc_error: XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); error: xfrm_pols_put(pols, num_pols); return ERR_PTR(err); } static struct dst_entry *make_blackhole(struct net *net, u16 family, struct dst_entry *dst_orig) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); struct dst_entry *ret; if (!afinfo) { dst_release(dst_orig); return ERR_PTR(-EINVAL); } else { ret = afinfo->blackhole_route(net, dst_orig); } rcu_read_unlock(); return ret; } /* Finds/creates a bundle for given flow and if_id * * At the moment we eat a raw IP route. Mostly to speed up lookups * on interfaces with disabled IPsec. * * xfrm_lookup uses an if_id of 0 by default, and is provided for * compatibility */ struct dst_entry *xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags, u32 if_id) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; struct xfrm_dst *xdst; struct dst_entry *dst, *route; u16 family = dst_orig->ops->family; u8 dir = XFRM_POLICY_OUT; int i, err, num_pols, num_xfrms = 0, drop_pols = 0; dst = NULL; xdst = NULL; route = NULL; sk = sk_const_to_full_sk(sk); if (sk && sk->sk_policy[XFRM_POLICY_OUT]) { num_pols = 1; pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family, if_id); err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); if (err < 0) goto dropdst; if (num_pols) { if (num_xfrms <= 0) { drop_pols = num_pols; goto no_transform; } xdst = xfrm_resolve_and_create_bundle( pols, num_pols, fl, family, dst_orig); if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); err = PTR_ERR(xdst); if (err == -EREMOTE) goto nopol; goto dropdst; } else if (xdst == NULL) { num_xfrms = 0; drop_pols = num_pols; goto no_transform; } route = xdst->route; } } if (xdst == NULL) { struct xfrm_flo xflo; xflo.dst_orig = dst_orig; xflo.flags = flags; /* To accelerate a bit... */ if (!if_id && ((dst_orig->flags & DST_NOXFRM) || !net->xfrm.policy_count[XFRM_POLICY_OUT])) goto nopol; xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id); if (xdst == NULL) goto nopol; if (IS_ERR(xdst)) { err = PTR_ERR(xdst); goto dropdst; } num_pols = xdst->num_pols; num_xfrms = xdst->num_xfrms; memcpy(pols, xdst->pols, sizeof(struct xfrm_policy *) * num_pols); route = xdst->route; } dst = &xdst->u.dst; if (route == NULL && num_xfrms > 0) { /* The only case when xfrm_bundle_lookup() returns a * bundle with null route, is when the template could * not be resolved. It means policies are there, but * bundle could not be created, since we don't yet * have the xfrm_state's. We need to wait for KM to * negotiate new SA's or bail out with error.*/ if (net->xfrm.sysctl_larval_drop) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); err = -EREMOTE; goto error; } err = -EAGAIN; XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); goto error; } no_transform: if (num_pols == 0) goto nopol; if ((flags & XFRM_LOOKUP_ICMP) && !(pols[0]->flags & XFRM_POLICY_ICMP)) { err = -ENOENT; goto error; } for (i = 0; i < num_pols; i++) pols[i]->curlft.use_time = ktime_get_real_seconds(); if (num_xfrms < 0) { /* Prohibit the flow */ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK); err = -EPERM; goto error; } else if (num_xfrms > 0) { /* Flow transformed */ dst_release(dst_orig); } else { /* Flow passes untransformed */ dst_release(dst); dst = dst_orig; } ok: xfrm_pols_put(pols, drop_pols); if (dst && dst->xfrm && dst->xfrm->props.mode == XFRM_MODE_TUNNEL) dst->flags |= DST_XFRM_TUNNEL; return dst; nopol: if (!(flags & XFRM_LOOKUP_ICMP)) { dst = dst_orig; goto ok; } err = -ENOENT; error: dst_release(dst); dropdst: if (!(flags & XFRM_LOOKUP_KEEP_DST_REF)) dst_release(dst_orig); xfrm_pols_put(pols, drop_pols); return ERR_PTR(err); } EXPORT_SYMBOL(xfrm_lookup_with_ifid); /* Main function: finds/creates a bundle for given flow. * * At the moment we eat a raw IP route. Mostly to speed up lookups * on interfaces with disabled IPsec. */ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { return xfrm_lookup_with_ifid(net, dst_orig, fl, sk, flags, 0); } EXPORT_SYMBOL(xfrm_lookup); /* Callers of xfrm_lookup_route() must ensure a call to dst_output(). * Otherwise we may send out blackholed packets. */ struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk, flags | XFRM_LOOKUP_QUEUE | XFRM_LOOKUP_KEEP_DST_REF); if (IS_ERR(dst) && PTR_ERR(dst) == -EREMOTE) return make_blackhole(net, dst_orig->ops->family, dst_orig); if (IS_ERR(dst)) dst_release(dst_orig); return dst; } EXPORT_SYMBOL(xfrm_lookup_route); static inline int xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl) { struct xfrm_state *x; if (!skb->sp || idx < 0 || idx >= skb->sp->len) return 0; x = skb->sp->xvec[idx]; if (!x->type->reject) return 0; return x->type->reject(x, skb, fl); } /* When skb is transformed back to its "native" form, we have to * check policy restrictions. At the moment we make this in maximally * stupid way. Shame on me. :-) Of course, connected sockets must * have policy cached at them. */ static inline int xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, unsigned short family) { if (xfrm_state_kern(x)) return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family); return x->id.proto == tmpl->id.proto && (x->id.spi == tmpl->id.spi || !tmpl->id.spi) && (x->props.reqid == tmpl->reqid || !tmpl->reqid) && x->props.mode == tmpl->mode && (tmpl->allalgs || (tmpl->aalgos & (1<<x->props.aalgo)) || !(xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))) && !(x->props.mode != XFRM_MODE_TRANSPORT && xfrm_state_addr_cmp(tmpl, x, family)); } /* * 0 or more than 0 is returned when validation is succeeded (either bypass * because of optional transport mode, or next index of the mathced secpath * state with the template. * -1 is returned when no matching template is found. * Otherwise "-2 - errored_index" is returned. */ static inline int xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start, unsigned short family) { int idx = start; if (tmpl->optional) { if (tmpl->mode == XFRM_MODE_TRANSPORT) return start; } else start = -1; for (; idx < sp->len; idx++) { if (xfrm_state_ok(tmpl, sp->xvec[idx], family)) return ++idx; if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) { if (start == -1) start = -2-idx; break; } } return start; } int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned int family, int reverse) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); int err; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; afinfo->decode_session(skb, fl, reverse); err = security_xfrm_decode_session(skb, &fl->flowi_secid); rcu_read_unlock(); return err; } EXPORT_SYMBOL(__xfrm_decode_session); static inline int secpath_has_nontransport(const struct sec_path *sp, int k, int *idxp) { for (; k < sp->len; k++) { if (sp->xvec[k]->props.mode != XFRM_MODE_TRANSPORT) { *idxp = k; return 1; } } return 0; } int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); struct xfrm_policy *pol; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int npols = 0; int xfrm_nr; int pi; int reverse; struct flowi fl; int xerr_idx = -1; const struct xfrm_if_cb *ifcb; struct xfrm_if *xi; u32 if_id = 0; rcu_read_lock(); ifcb = xfrm_if_get_cb(); if (ifcb) { xi = ifcb->decode_session(skb, family); if (xi) { if_id = xi->p.if_id; net = xi->net; } } rcu_read_unlock(); reverse = dir & ~XFRM_POLICY_MASK; dir &= XFRM_POLICY_MASK; if (__xfrm_decode_session(skb, &fl, family, reverse) < 0) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); return 0; } nf_nat_decode_session(skb, &fl, family); /* First, check used SA against their selectors. */ if (skb->sp) { int i; for (i = skb->sp->len-1; i >= 0; i--) { struct xfrm_state *x = skb->sp->xvec[i]; if (!xfrm_selector_match(&x->sel, &fl, family)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH); return 0; } } } pol = NULL; sk = sk_to_full_sk(sk); if (sk && sk->sk_policy[dir]) { pol = xfrm_sk_policy_lookup(sk, dir, &fl, family, if_id); if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); return 0; } } if (!pol) pol = xfrm_policy_lookup(net, &fl, family, dir, if_id); if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); return 0; } if (!pol) { if (skb->sp && secpath_has_nontransport(skb->sp, 0, &xerr_idx)) { xfrm_secpath_reject(xerr_idx, skb, &fl); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); return 0; } return 1; } pol->curlft.use_time = ktime_get_real_seconds(); pols[0] = pol; npols++; #ifdef CONFIG_XFRM_SUB_POLICY if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) { pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, &fl, family, XFRM_POLICY_IN, if_id); if (pols[1]) { if (IS_ERR(pols[1])) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); xfrm_pol_put(pols[0]); return 0; } pols[1]->curlft.use_time = ktime_get_real_seconds(); npols++; } } #endif if (pol->action == XFRM_POLICY_ALLOW) { struct sec_path *sp; static struct sec_path dummy; struct xfrm_tmpl *tp[XFRM_MAX_DEPTH]; struct xfrm_tmpl *stp[XFRM_MAX_DEPTH]; struct xfrm_tmpl **tpp = tp; int ti = 0; int i, k; if ((sp = skb->sp) == NULL) sp = &dummy; for (pi = 0; pi < npols; pi++) { if (pols[pi] != pol && pols[pi]->action != XFRM_POLICY_ALLOW) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK); goto reject; } if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto reject_error; } for (i = 0; i < pols[pi]->xfrm_nr; i++) tpp[ti++] = &pols[pi]->xfrm_vec[i]; } xfrm_nr = ti; if (npols > 1) { xfrm_tmpl_sort(stp, tpp, xfrm_nr, family, net); tpp = stp; } /* For each tunnel xfrm, find the first matching tmpl. * For each tmpl before that, find corresponding xfrm. * Order is _important_. Later we will implement * some barriers, but at the moment barriers * are implied between each two transformations. */ for (i = xfrm_nr-1, k = 0; i >= 0; i--) { k = xfrm_policy_ok(tpp[i], sp, k, family); if (k < 0) { if (k < -1) /* "-2 - errored_index" returned */ xerr_idx = -(2+k); XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH); goto reject; } } if (secpath_has_nontransport(sp, k, &xerr_idx)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH); goto reject; } xfrm_pols_put(pols, npols); return 1; } XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK); reject: xfrm_secpath_reject(xerr_idx, skb, &fl); reject_error: xfrm_pols_put(pols, npols); return 0; } EXPORT_SYMBOL(__xfrm_policy_check); int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); struct flowi fl; struct dst_entry *dst; int res = 1; if (xfrm_decode_session(skb, &fl, family) < 0) { XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); return 0; } skb_dst_force(skb); if (!skb_dst(skb)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); return 0; } dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) { res = 0; dst = NULL; } skb_dst_set(skb, dst); return res; } EXPORT_SYMBOL(__xfrm_route_forward); /* Optimize later using cookies and generation ids. */ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) { /* Code (such as __xfrm4_bundle_create()) sets dst->obsolete * to DST_OBSOLETE_FORCE_CHK to force all XFRM destinations to * get validated by dst_ops->check on every use. We do this * because when a normal route referenced by an XFRM dst is * obsoleted we do not go looking around for all parent * referencing XFRM dsts so that we can invalidate them. It * is just too much work. Instead we make the checks here on * every use. For example: * * XFRM dst A --> IPv4 dst X * * X is the "xdst->route" of A (X is also the "dst->path" of A * in this example). If X is marked obsolete, "A" will not * notice. That's what we are validating here via the * stale_bundle() check. * * When a dst is removed from the fib tree, DST_OBSOLETE_DEAD will * be marked on it. * This will force stale_bundle() to fail on any xdst bundle with * this dst linked in it. */ if (dst->obsolete < 0 && !stale_bundle(dst)) return dst; return NULL; } static int stale_bundle(struct dst_entry *dst) { return !xfrm_bundle_ok((struct xfrm_dst *)dst); } void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { while ((dst = xfrm_dst_child(dst)) && dst->xfrm && dst->dev == dev) { dst->dev = dev_net(dev)->loopback_dev; dev_hold(dst->dev); dev_put(dev); } } EXPORT_SYMBOL(xfrm_dst_ifdown); static void xfrm_link_failure(struct sk_buff *skb) { /* Impossible. Such dst must be popped before reaches point of failure. */ } static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst) { if (dst) { if (dst->obsolete) { dst_release(dst); dst = NULL; } } return dst; } static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr) { while (nr--) { struct xfrm_dst *xdst = bundle[nr]; u32 pmtu, route_mtu_cached; struct dst_entry *dst; dst = &xdst->u.dst; pmtu = dst_mtu(xfrm_dst_child(dst)); xdst->child_mtu_cached = pmtu; pmtu = xfrm_state_mtu(dst->xfrm, pmtu); route_mtu_cached = dst_mtu(xdst->route); xdst->route_mtu_cached = route_mtu_cached; if (pmtu > route_mtu_cached) pmtu = route_mtu_cached; dst_metric_set(dst, RTAX_MTU, pmtu); } } /* Check that the bundle accepts the flow and its components are * still valid. */ static int xfrm_bundle_ok(struct xfrm_dst *first) { struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; struct dst_entry *dst = &first->u.dst; struct xfrm_dst *xdst; int start_from, nr; u32 mtu; if (!dst_check(xfrm_dst_path(dst), ((struct xfrm_dst *)dst)->path_cookie) || (dst->dev && !netif_running(dst->dev))) return 0; if (dst->flags & DST_XFRM_QUEUE) return 1; start_from = nr = 0; do { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; if (dst->xfrm->km.state != XFRM_STATE_VALID) return 0; if (xdst->xfrm_genid != dst->xfrm->genid) return 0; if (xdst->num_pols > 0 && xdst->policy_genid != atomic_read(&xdst->pols[0]->genid)) return 0; bundle[nr++] = xdst; mtu = dst_mtu(xfrm_dst_child(dst)); if (xdst->child_mtu_cached != mtu) { start_from = nr; xdst->child_mtu_cached = mtu; } if (!dst_check(xdst->route, xdst->route_cookie)) return 0; mtu = dst_mtu(xdst->route); if (xdst->route_mtu_cached != mtu) { start_from = nr; xdst->route_mtu_cached = mtu; } dst = xfrm_dst_child(dst); } while (dst->xfrm); if (likely(!start_from)) return 1; xdst = bundle[start_from - 1]; mtu = xdst->child_mtu_cached; while (start_from--) { dst = &xdst->u.dst; mtu = xfrm_state_mtu(dst->xfrm, mtu); if (mtu > xdst->route_mtu_cached) mtu = xdst->route_mtu_cached; dst_metric_set(dst, RTAX_MTU, mtu); if (!start_from) break; xdst = bundle[start_from - 1]; xdst->child_mtu_cached = mtu; } return 1; } static unsigned int xfrm_default_advmss(const struct dst_entry *dst) { return dst_metric_advmss(xfrm_dst_path(dst)); } static unsigned int xfrm_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); return mtu ? : dst_mtu(xfrm_dst_path(dst)); } static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst, const void *daddr) { while (dst->xfrm) { const struct xfrm_state *xfrm = dst->xfrm; dst = xfrm_dst_child(dst); if (xfrm->props.mode == XFRM_MODE_TRANSPORT) continue; if (xfrm->type->flags & XFRM_TYPE_REMOTE_COADDR) daddr = xfrm->coaddr; else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR)) daddr = &xfrm->id.daddr; } return daddr; } static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { const struct dst_entry *path = xfrm_dst_path(dst); if (!skb) daddr = xfrm_get_dst_nexthop(dst, daddr); return path->ops->neigh_lookup(path, skb, daddr); } static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct dst_entry *path = xfrm_dst_path(dst); daddr = xfrm_get_dst_nexthop(dst, daddr); path->ops->confirm_neigh(path, daddr); } int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family) { int err = 0; if (WARN_ON(family >= ARRAY_SIZE(xfrm_policy_afinfo))) return -EAFNOSUPPORT; spin_lock(&xfrm_policy_afinfo_lock); if (unlikely(xfrm_policy_afinfo[family] != NULL)) err = -EEXIST; else { struct dst_ops *dst_ops = afinfo->dst_ops; if (likely(dst_ops->kmem_cachep == NULL)) dst_ops->kmem_cachep = xfrm_dst_cache; if (likely(dst_ops->check == NULL)) dst_ops->check = xfrm_dst_check; if (likely(dst_ops->default_advmss == NULL)) dst_ops->default_advmss = xfrm_default_advmss; if (likely(dst_ops->mtu == NULL)) dst_ops->mtu = xfrm_mtu; if (likely(dst_ops->negative_advice == NULL)) dst_ops->negative_advice = xfrm_negative_advice; if (likely(dst_ops->link_failure == NULL)) dst_ops->link_failure = xfrm_link_failure; if (likely(dst_ops->neigh_lookup == NULL)) dst_ops->neigh_lookup = xfrm_neigh_lookup; if (likely(!dst_ops->confirm_neigh)) dst_ops->confirm_neigh = xfrm_confirm_neigh; rcu_assign_pointer(xfrm_policy_afinfo[family], afinfo); } spin_unlock(&xfrm_policy_afinfo_lock); return err; } EXPORT_SYMBOL(xfrm_policy_register_afinfo); void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo) { struct dst_ops *dst_ops = afinfo->dst_ops; int i; for (i = 0; i < ARRAY_SIZE(xfrm_policy_afinfo); i++) { if (xfrm_policy_afinfo[i] != afinfo) continue; RCU_INIT_POINTER(xfrm_policy_afinfo[i], NULL); break; } synchronize_rcu(); dst_ops->kmem_cachep = NULL; dst_ops->check = NULL; dst_ops->negative_advice = NULL; dst_ops->link_failure = NULL; } EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb) { spin_lock(&xfrm_if_cb_lock); rcu_assign_pointer(xfrm_if_cb, ifcb); spin_unlock(&xfrm_if_cb_lock); } EXPORT_SYMBOL(xfrm_if_register_cb); void xfrm_if_unregister_cb(void) { RCU_INIT_POINTER(xfrm_if_cb, NULL); synchronize_rcu(); } EXPORT_SYMBOL(xfrm_if_unregister_cb); #ifdef CONFIG_XFRM_STATISTICS static int __net_init xfrm_statistics_init(struct net *net) { int rv; net->mib.xfrm_statistics = alloc_percpu(struct linux_xfrm_mib); if (!net->mib.xfrm_statistics) return -ENOMEM; rv = xfrm_proc_init(net); if (rv < 0) free_percpu(net->mib.xfrm_statistics); return rv; } static void xfrm_statistics_fini(struct net *net) { xfrm_proc_fini(net); free_percpu(net->mib.xfrm_statistics); } #else static int __net_init xfrm_statistics_init(struct net *net) { return 0; } static void xfrm_statistics_fini(struct net *net) { } #endif static int __net_init xfrm_policy_init(struct net *net) { unsigned int hmask, sz; int dir; if (net_eq(net, &init_net)) xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache", sizeof(struct xfrm_dst), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); hmask = 8 - 1; sz = (hmask+1) * sizeof(struct hlist_head); net->xfrm.policy_byidx = xfrm_hash_alloc(sz); if (!net->xfrm.policy_byidx) goto out_byidx; net->xfrm.policy_idx_hmask = hmask; for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy_hash *htab; net->xfrm.policy_count[dir] = 0; net->xfrm.policy_count[XFRM_POLICY_MAX + dir] = 0; INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]); htab = &net->xfrm.policy_bydst[dir]; htab->table = xfrm_hash_alloc(sz); if (!htab->table) goto out_bydst; htab->hmask = hmask; htab->dbits4 = 32; htab->sbits4 = 32; htab->dbits6 = 128; htab->sbits6 = 128; } net->xfrm.policy_hthresh.lbits4 = 32; net->xfrm.policy_hthresh.rbits4 = 32; net->xfrm.policy_hthresh.lbits6 = 128; net->xfrm.policy_hthresh.rbits6 = 128; seqlock_init(&net->xfrm.policy_hthresh.lock); INIT_LIST_HEAD(&net->xfrm.policy_all); INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild); return 0; out_bydst: for (dir--; dir >= 0; dir--) { struct xfrm_policy_hash *htab; htab = &net->xfrm.policy_bydst[dir]; xfrm_hash_free(htab->table, sz); } xfrm_hash_free(net->xfrm.policy_byidx, sz); out_byidx: return -ENOMEM; } static void xfrm_policy_fini(struct net *net) { unsigned int sz; int dir; flush_work(&net->xfrm.policy_hash_work); #ifdef CONFIG_XFRM_SUB_POLICY xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, false); #endif xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, false); WARN_ON(!list_empty(&net->xfrm.policy_all)); for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy_hash *htab; WARN_ON(!hlist_empty(&net->xfrm.policy_inexact[dir])); htab = &net->xfrm.policy_bydst[dir]; sz = (htab->hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(htab->table)); xfrm_hash_free(htab->table, sz); } sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(net->xfrm.policy_byidx)); xfrm_hash_free(net->xfrm.policy_byidx, sz); } static int __net_init xfrm_net_init(struct net *net) { int rv; /* Initialize the per-net locks here */ spin_lock_init(&net->xfrm.xfrm_state_lock); spin_lock_init(&net->xfrm.xfrm_policy_lock); mutex_init(&net->xfrm.xfrm_cfg_mutex); rv = xfrm_statistics_init(net); if (rv < 0) goto out_statistics; rv = xfrm_state_init(net); if (rv < 0) goto out_state; rv = xfrm_policy_init(net); if (rv < 0) goto out_policy; rv = xfrm_sysctl_init(net); if (rv < 0) goto out_sysctl; return 0; out_sysctl: xfrm_policy_fini(net); out_policy: xfrm_state_fini(net); out_state: xfrm_statistics_fini(net); out_statistics: return rv; } static void __net_exit xfrm_net_exit(struct net *net) { xfrm_sysctl_fini(net); xfrm_policy_fini(net); xfrm_state_fini(net); xfrm_statistics_fini(net); } static struct pernet_operations __net_initdata xfrm_net_ops = { .init = xfrm_net_init, .exit = xfrm_net_exit, }; void __init xfrm_init(void) { register_pernet_subsys(&xfrm_net_ops); xfrm_dev_init(); seqcount_init(&xfrm_policy_hash_generation); xfrm_input_init(); RCU_INIT_POINTER(xfrm_if_cb, NULL); synchronize_rcu(); } #ifdef CONFIG_AUDITSYSCALL static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp, struct audit_buffer *audit_buf) { struct xfrm_sec_ctx *ctx = xp->security; struct xfrm_selector *sel = &xp->selector; if (ctx) audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s", ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str); switch (sel->family) { case AF_INET: audit_log_format(audit_buf, " src=%pI4", &sel->saddr.a4); if (sel->prefixlen_s != 32) audit_log_format(audit_buf, " src_prefixlen=%d", sel->prefixlen_s); audit_log_format(audit_buf, " dst=%pI4", &sel->daddr.a4); if (sel->prefixlen_d != 32) audit_log_format(audit_buf, " dst_prefixlen=%d", sel->prefixlen_d); break; case AF_INET6: audit_log_format(audit_buf, " src=%pI6", sel->saddr.a6); if (sel->prefixlen_s != 128) audit_log_format(audit_buf, " src_prefixlen=%d", sel->prefixlen_s); audit_log_format(audit_buf, " dst=%pI6", sel->daddr.a6); if (sel->prefixlen_d != 128) audit_log_format(audit_buf, " dst_prefixlen=%d", sel->prefixlen_d); break; } } void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SPD-add"); if (audit_buf == NULL) return; xfrm_audit_helper_usrinfo(task_valid, audit_buf); audit_log_format(audit_buf, " res=%u", result); xfrm_audit_common_policyinfo(xp, audit_buf); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_policy_add); void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SPD-delete"); if (audit_buf == NULL) return; xfrm_audit_helper_usrinfo(task_valid, audit_buf); audit_log_format(audit_buf, " res=%u", result); xfrm_audit_common_policyinfo(xp, audit_buf); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete); #endif #ifdef CONFIG_XFRM_MIGRATE static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp, const struct xfrm_selector *sel_tgt) { if (sel_cmp->proto == IPSEC_ULPROTO_ANY) { if (sel_tgt->family == sel_cmp->family && xfrm_addr_equal(&sel_tgt->daddr, &sel_cmp->daddr, sel_cmp->family) && xfrm_addr_equal(&sel_tgt->saddr, &sel_cmp->saddr, sel_cmp->family) && sel_tgt->prefixlen_d == sel_cmp->prefixlen_d && sel_tgt->prefixlen_s == sel_cmp->prefixlen_s) { return true; } } else { if (memcmp(sel_tgt, sel_cmp, sizeof(*sel_tgt)) == 0) { return true; } } return false; } static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel, u8 dir, u8 type, struct net *net, u32 if_id) { struct xfrm_policy *pol, *ret = NULL; struct hlist_head *chain; u32 priority = ~0U; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir); hlist_for_each_entry(pol, chain, bydst) { if ((if_id == 0 || pol->if_id == if_id) && xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type) { ret = pol; priority = ret->priority; break; } } chain = &net->xfrm.policy_inexact[dir]; hlist_for_each_entry(pol, chain, bydst) { if ((pol->priority >= priority) && ret) break; if ((if_id == 0 || pol->if_id == if_id) && xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type) { ret = pol; break; } } xfrm_pol_hold(ret); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return ret; } static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tmpl *t) { int match = 0; if (t->mode == m->mode && t->id.proto == m->proto && (m->reqid == 0 || t->reqid == m->reqid)) { switch (t->mode) { case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr, m->old_family) && xfrm_addr_equal(&t->saddr, &m->old_saddr, m->old_family)) { match = 1; } break; case XFRM_MODE_TRANSPORT: /* in case of transport mode, template does not store any IP addresses, hence we just compare mode and protocol */ match = 1; break; default: break; } } return match; } /* update endpoint address(es) of template(s) */ static int xfrm_policy_migrate(struct xfrm_policy *pol, struct xfrm_migrate *m, int num_migrate) { struct xfrm_migrate *mp; int i, j, n = 0; write_lock_bh(&pol->lock); if (unlikely(pol->walk.dead)) { /* target policy has been deleted */ write_unlock_bh(&pol->lock); return -ENOENT; } for (i = 0; i < pol->xfrm_nr; i++) { for (j = 0, mp = m; j < num_migrate; j++, mp++) { if (!migrate_tmpl_match(mp, &pol->xfrm_vec[i])) continue; n++; if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL && pol->xfrm_vec[i].mode != XFRM_MODE_BEET) continue; /* update endpoints */ memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr, sizeof(pol->xfrm_vec[i].id.daddr)); memcpy(&pol->xfrm_vec[i].saddr, &mp->new_saddr, sizeof(pol->xfrm_vec[i].saddr)); pol->xfrm_vec[i].encap_family = mp->new_family; /* flush bundles */ atomic_inc(&pol->genid); } } write_unlock_bh(&pol->lock); if (!n) return -ENODATA; return 0; } static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate) { int i, j; if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH) return -EINVAL; for (i = 0; i < num_migrate; i++) { if (xfrm_addr_any(&m[i].new_daddr, m[i].new_family) || xfrm_addr_any(&m[i].new_saddr, m[i].new_family)) return -EINVAL; /* check if there is any duplicated entry */ for (j = i + 1; j < num_migrate; j++) { if (!memcmp(&m[i].old_daddr, &m[j].old_daddr, sizeof(m[i].old_daddr)) && !memcmp(&m[i].old_saddr, &m[j].old_saddr, sizeof(m[i].old_saddr)) && m[i].proto == m[j].proto && m[i].mode == m[j].mode && m[i].reqid == m[j].reqid && m[i].old_family == m[j].old_family) return -EINVAL; } } return 0; } int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_migrate, struct xfrm_kmaddress *k, struct net *net, struct xfrm_encap_tmpl *encap, u32 if_id) { int i, err, nx_cur = 0, nx_new = 0; struct xfrm_policy *pol = NULL; struct xfrm_state *x, *xc; struct xfrm_state *x_cur[XFRM_MAX_DEPTH]; struct xfrm_state *x_new[XFRM_MAX_DEPTH]; struct xfrm_migrate *mp; /* Stage 0 - sanity checks */ if ((err = xfrm_migrate_check(m, num_migrate)) < 0) goto out; if (dir >= XFRM_POLICY_MAX) { err = -EINVAL; goto out; } /* Stage 1 - find policy */ if ((pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id)) == NULL) { err = -ENOENT; goto out; } /* Stage 2 - find and update state(s) */ for (i = 0, mp = m; i < num_migrate; i++, mp++) { if ((x = xfrm_migrate_state_find(mp, net, if_id))) { x_cur[nx_cur] = x; nx_cur++; xc = xfrm_state_migrate(x, mp, encap); if (xc) { x_new[nx_new] = xc; nx_new++; } else { err = -ENODATA; goto restore_state; } } } /* Stage 3 - update policy */ if ((err = xfrm_policy_migrate(pol, m, num_migrate)) < 0) goto restore_state; /* Stage 4 - delete old state(s) */ if (nx_cur) { xfrm_states_put(x_cur, nx_cur); xfrm_states_delete(x_cur, nx_cur); } /* Stage 5 - announce */ km_migrate(sel, dir, type, m, num_migrate, k, encap); xfrm_pol_put(pol); return 0; out: return err; restore_state: if (pol) xfrm_pol_put(pol); if (nx_cur) xfrm_states_put(x_cur, nx_cur); if (nx_new) xfrm_states_delete(x_new, nx_new); return err; } EXPORT_SYMBOL(xfrm_migrate); #endif |
3 270 277 182 266 233 89 182 6 6 6 6 110 240 238 240 240 239 225 237 238 3 237 224 224 235 235 235 234 235 235 228 227 228 235 9 228 227 235 235 235 234 227 234 231 234 220 234 234 234 240 225 240 223 234 6 5 4 5 218 218 239 239 141 178 218 218 218 218 217 218 218 218 222 222 222 4 221 218 216 217 218 218 218 218 218 217 72 72 218 218 51 217 4 222 3 2 2 91 2 2 1 1 1 1 220 219 219 218 1 218 217 218 218 218 218 3 4 34 34 2 2 1 1 1 2 2 3 2 4 2 2 2 2 3 178 178 177 178 202 199 199 81 3 184 184 142 140 140 142 141 142 23 142 44 43 62 1 91 91 90 91 90 91 58 91 91 1 1 1 1 1 1 1 2 182 182 182 181 181 182 182 182 1 182 182 34 34 34 34 34 25 2 23 8 34 10 8 35 34 2 32 34 34 33 34 33 33 33 9 9 8 9 9 9 9 33 34 228 228 90 2 1 1 1 1 2 2 227 227 221 221 226 226 226 226 226 225 226 226 226 226 226 226 218 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 110 34 36 36 36 36 33 35 35 32 35 35 35 116 113 10 116 110 110 110 110 34 31 21 31 18 18 18 16 16 15 34 34 34 1 1 3 4 33 3 3 3 3 41 19 7 5 5 5 5 3 2 2 7 6 6 3 35 35 34 35 7 5 5 1 4 3 1 4 2 2 4 7 2 1 4 3 1 4 5 4 4 3 2 5 5 3 3 2 93 94 94 94 2 2 4 2 4 1 3 3 1 3 2 1 1 7 3 1 1 94 95 242 5 5 218 218 182 114 223 34 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 8 8 5 8 1 7 1 1 1 3 3 3 8 5 3 6 5 5 4 3 3 3 8 3 4 4 4 4 4 4 4 15 14 12 12 14 15 3 2 2 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 | /* * Digital Audio (PCM) abstract layer * Copyright (c) by Jaroslav Kysela <perex@perex.cz> * * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ #include <linux/mm.h> #include <linux/module.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/time.h> #include <linux/pm_qos.h> #include <linux/io.h> #include <linux/dma-mapping.h> #include <sound/core.h> #include <sound/control.h> #include <sound/info.h> #include <sound/pcm.h> #include <sound/pcm_params.h> #include <sound/timer.h> #include <sound/minors.h> #include <linux/uio.h> #include <linux/delay.h> #include "pcm_local.h" #ifdef CONFIG_SND_DEBUG #define CREATE_TRACE_POINTS #include "pcm_param_trace.h" #else #define trace_hw_mask_param_enabled() 0 #define trace_hw_interval_param_enabled() 0 #define trace_hw_mask_param(substream, type, index, prev, curr) #define trace_hw_interval_param(substream, type, index, prev, curr) #endif /* * Compatibility */ struct snd_pcm_hw_params_old { unsigned int flags; unsigned int masks[SNDRV_PCM_HW_PARAM_SUBFORMAT - SNDRV_PCM_HW_PARAM_ACCESS + 1]; struct snd_interval intervals[SNDRV_PCM_HW_PARAM_TICK_TIME - SNDRV_PCM_HW_PARAM_SAMPLE_BITS + 1]; unsigned int rmask; unsigned int cmask; unsigned int info; unsigned int msbits; unsigned int rate_num; unsigned int rate_den; snd_pcm_uframes_t fifo_size; unsigned char reserved[64]; }; #ifdef CONFIG_SND_SUPPORT_OLD_API #define SNDRV_PCM_IOCTL_HW_REFINE_OLD _IOWR('A', 0x10, struct snd_pcm_hw_params_old) #define SNDRV_PCM_IOCTL_HW_PARAMS_OLD _IOWR('A', 0x11, struct snd_pcm_hw_params_old) static int snd_pcm_hw_refine_old_user(struct snd_pcm_substream *substream, struct snd_pcm_hw_params_old __user * _oparams); static int snd_pcm_hw_params_old_user(struct snd_pcm_substream *substream, struct snd_pcm_hw_params_old __user * _oparams); #endif static int snd_pcm_open(struct file *file, struct snd_pcm *pcm, int stream); /* * */ static DEFINE_RWLOCK(snd_pcm_link_rwlock); static DECLARE_RWSEM(snd_pcm_link_rwsem); /* Writer in rwsem may block readers even during its waiting in queue, * and this may lead to a deadlock when the code path takes read sem * twice (e.g. one in snd_pcm_action_nonatomic() and another in * snd_pcm_stream_lock()). As a (suboptimal) workaround, let writer to * sleep until all the readers are completed without blocking by writer. */ static inline void down_write_nonfifo(struct rw_semaphore *lock) { while (!down_write_trylock(lock)) msleep(1); } #define PCM_LOCK_DEFAULT 0 #define PCM_LOCK_IRQ 1 #define PCM_LOCK_IRQSAVE 2 static unsigned long __snd_pcm_stream_lock_mode(struct snd_pcm_substream *substream, unsigned int mode) { unsigned long flags = 0; if (substream->pcm->nonatomic) { down_read_nested(&snd_pcm_link_rwsem, SINGLE_DEPTH_NESTING); mutex_lock(&substream->self_group.mutex); } else { switch (mode) { case PCM_LOCK_DEFAULT: read_lock(&snd_pcm_link_rwlock); break; case PCM_LOCK_IRQ: read_lock_irq(&snd_pcm_link_rwlock); break; case PCM_LOCK_IRQSAVE: read_lock_irqsave(&snd_pcm_link_rwlock, flags); break; } spin_lock(&substream->self_group.lock); } return flags; } static void __snd_pcm_stream_unlock_mode(struct snd_pcm_substream *substream, unsigned int mode, unsigned long flags) { if (substream->pcm->nonatomic) { mutex_unlock(&substream->self_group.mutex); up_read(&snd_pcm_link_rwsem); } else { spin_unlock(&substream->self_group.lock); switch (mode) { case PCM_LOCK_DEFAULT: read_unlock(&snd_pcm_link_rwlock); break; case PCM_LOCK_IRQ: read_unlock_irq(&snd_pcm_link_rwlock); break; case PCM_LOCK_IRQSAVE: read_unlock_irqrestore(&snd_pcm_link_rwlock, flags); break; } } } /** * snd_pcm_stream_lock - Lock the PCM stream * @substream: PCM substream * * This locks the PCM stream's spinlock or mutex depending on the nonatomic * flag of the given substream. This also takes the global link rw lock * (or rw sem), too, for avoiding the race with linked streams. */ void snd_pcm_stream_lock(struct snd_pcm_substream *substream) { __snd_pcm_stream_lock_mode(substream, PCM_LOCK_DEFAULT); } EXPORT_SYMBOL_GPL(snd_pcm_stream_lock); /** * snd_pcm_stream_lock - Unlock the PCM stream * @substream: PCM substream * * This unlocks the PCM stream that has been locked via snd_pcm_stream_lock(). */ void snd_pcm_stream_unlock(struct snd_pcm_substream *substream) { __snd_pcm_stream_unlock_mode(substream, PCM_LOCK_DEFAULT, 0); } EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock); /** * snd_pcm_stream_lock_irq - Lock the PCM stream * @substream: PCM substream * * This locks the PCM stream like snd_pcm_stream_lock() and disables the local * IRQ (only when nonatomic is false). In nonatomic case, this is identical * as snd_pcm_stream_lock(). */ void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream) { __snd_pcm_stream_lock_mode(substream, PCM_LOCK_IRQ); } EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq); /** * snd_pcm_stream_unlock_irq - Unlock the PCM stream * @substream: PCM substream * * This is a counter-part of snd_pcm_stream_lock_irq(). */ void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream) { __snd_pcm_stream_unlock_mode(substream, PCM_LOCK_IRQ, 0); } EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq); unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream) { return __snd_pcm_stream_lock_mode(substream, PCM_LOCK_IRQSAVE); } EXPORT_SYMBOL_GPL(_snd_pcm_stream_lock_irqsave); /** * snd_pcm_stream_unlock_irqrestore - Unlock the PCM stream * @substream: PCM substream * @flags: irq flags * * This is a counter-part of snd_pcm_stream_lock_irqsave(). */ void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream, unsigned long flags) { __snd_pcm_stream_unlock_mode(substream, PCM_LOCK_IRQSAVE, flags); } EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore); int snd_pcm_info(struct snd_pcm_substream *substream, struct snd_pcm_info *info) { struct snd_pcm *pcm = substream->pcm; struct snd_pcm_str *pstr = substream->pstr; memset(info, 0, sizeof(*info)); info->card = pcm->card->number; info->device = pcm->device; info->stream = substream->stream; info->subdevice = substream->number; strlcpy(info->id, pcm->id, sizeof(info->id)); strlcpy(info->name, pcm->name, sizeof(info->name)); info->dev_class = pcm->dev_class; info->dev_subclass = pcm->dev_subclass; info->subdevices_count = pstr->substream_count; info->subdevices_avail = pstr->substream_count - pstr->substream_opened; strlcpy(info->subname, substream->name, sizeof(info->subname)); return 0; } int snd_pcm_info_user(struct snd_pcm_substream *substream, struct snd_pcm_info __user * _info) { struct snd_pcm_info *info; int err; info = kmalloc(sizeof(*info), GFP_KERNEL); if (! info) return -ENOMEM; err = snd_pcm_info(substream, info); if (err >= 0) { if (copy_to_user(_info, info, sizeof(*info))) err = -EFAULT; } kfree(info); return err; } static bool hw_support_mmap(struct snd_pcm_substream *substream) { if (!(substream->runtime->hw.info & SNDRV_PCM_INFO_MMAP)) return false; /* architecture supports dma_mmap_coherent()? */ #if defined(CONFIG_ARCH_NO_COHERENT_DMA_MMAP) || !defined(CONFIG_HAS_DMA) if (!substream->ops->mmap && substream->dma_buffer.dev.type == SNDRV_DMA_TYPE_DEV) return false; #endif return true; } static int constrain_mask_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params) { struct snd_pcm_hw_constraints *constrs = &substream->runtime->hw_constraints; struct snd_mask *m; unsigned int k; struct snd_mask old_mask; int changed; for (k = SNDRV_PCM_HW_PARAM_FIRST_MASK; k <= SNDRV_PCM_HW_PARAM_LAST_MASK; k++) { m = hw_param_mask(params, k); if (snd_mask_empty(m)) return -EINVAL; /* This parameter is not requested to change by a caller. */ if (!(params->rmask & (1 << k))) continue; if (trace_hw_mask_param_enabled()) old_mask = *m; changed = snd_mask_refine(m, constrs_mask(constrs, k)); if (changed < 0) return changed; if (changed == 0) continue; /* Set corresponding flag so that the caller gets it. */ trace_hw_mask_param(substream, k, 0, &old_mask, m); params->cmask |= 1 << k; } return 0; } static int constrain_interval_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params) { struct snd_pcm_hw_constraints *constrs = &substream->runtime->hw_constraints; struct snd_interval *i; unsigned int k; struct snd_interval old_interval; int changed; for (k = SNDRV_PCM_HW_PARAM_FIRST_INTERVAL; k <= SNDRV_PCM_HW_PARAM_LAST_INTERVAL; k++) { i = hw_param_interval(params, k); if (snd_interval_empty(i)) return -EINVAL; /* This parameter is not requested to change by a caller. */ if (!(params->rmask & (1 << k))) continue; if (trace_hw_interval_param_enabled()) old_interval = *i; changed = snd_interval_refine(i, constrs_interval(constrs, k)); if (changed < 0) return changed; if (changed == 0) continue; /* Set corresponding flag so that the caller gets it. */ trace_hw_interval_param(substream, k, 0, &old_interval, i); params->cmask |= 1 << k; } return 0; } static int constrain_params_by_rules(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params) { struct snd_pcm_hw_constraints *constrs = &substream->runtime->hw_constraints; unsigned int k; unsigned int *rstamps; unsigned int vstamps[SNDRV_PCM_HW_PARAM_LAST_INTERVAL + 1]; unsigned int stamp; struct snd_pcm_hw_rule *r; unsigned int d; struct snd_mask old_mask; struct snd_interval old_interval; bool again; int changed, err = 0; /* * Each application of rule has own sequence number. * * Each member of 'rstamps' array represents the sequence number of * recent application of corresponding rule. */ rstamps = kcalloc(constrs->rules_num, sizeof(unsigned int), GFP_KERNEL); if (!rstamps) return -ENOMEM; /* * Each member of 'vstamps' array represents the sequence number of * recent application of rule in which corresponding parameters were * changed. * * In initial state, elements corresponding to parameters requested by * a caller is 1. For unrequested parameters, corresponding members * have 0 so that the parameters are never changed anymore. */ for (k = 0; k <= SNDRV_PCM_HW_PARAM_LAST_INTERVAL; k++) vstamps[k] = (params->rmask & (1 << k)) ? 1 : 0; /* Due to the above design, actual sequence number starts at 2. */ stamp = 2; retry: /* Apply all rules in order. */ again = false; for (k = 0; k < constrs->rules_num; k++) { r = &constrs->rules[k]; /* * Check condition bits of this rule. When the rule has * some condition bits, parameter without the bits is * never processed. SNDRV_PCM_HW_PARAMS_NO_PERIOD_WAKEUP * is an example of the condition bits. */ if (r->cond && !(r->cond & params->flags)) continue; /* * The 'deps' array includes maximum three dependencies * to SNDRV_PCM_HW_PARAM_XXXs for this rule. The fourth * member of this array is a sentinel and should be * negative value. * * This rule should be processed in this time when dependent * parameters were changed at former applications of the other * rules. */ for (d = 0; r->deps[d] >= 0; d++) { if (vstamps[r->deps[d]] > rstamps[k]) break; } if (r->deps[d] < 0) continue; if (trace_hw_mask_param_enabled()) { if (hw_is_mask(r->var)) old_mask = *hw_param_mask(params, r->var); } if (trace_hw_interval_param_enabled()) { if (hw_is_interval(r->var)) old_interval = *hw_param_interval(params, r->var); } changed = r->func(params, r); if (changed < 0) { err = changed; goto out; } /* * When the parameter is changed, notify it to the caller * by corresponding returned bit, then preparing for next * iteration. */ if (changed && r->var >= 0) { if (hw_is_mask(r->var)) { trace_hw_mask_param(substream, r->var, k + 1, &old_mask, hw_param_mask(params, r->var)); } if (hw_is_interval(r->var)) { trace_hw_interval_param(substream, r->var, k + 1, &old_interval, hw_param_interval(params, r->var)); } params->cmask |= (1 << r->var); vstamps[r->var] = stamp; again = true; } rstamps[k] = stamp++; } /* Iterate to evaluate all rules till no parameters are changed. */ if (again) goto retry; out: kfree(rstamps); return err; } static int fixup_unreferenced_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params) { const struct snd_interval *i; const struct snd_mask *m; int err; if (!params->msbits) { i = hw_param_interval_c(params, SNDRV_PCM_HW_PARAM_SAMPLE_BITS); if (snd_interval_single(i)) params->msbits = snd_interval_value(i); } if (!params->rate_den) { i = hw_param_interval_c(params, SNDRV_PCM_HW_PARAM_RATE); if (snd_interval_single(i)) { params->rate_num = snd_interval_value(i); params->rate_den = 1; } } if (!params->fifo_size) { m = hw_param_mask_c(params, SNDRV_PCM_HW_PARAM_FORMAT); i = hw_param_interval_c(params, SNDRV_PCM_HW_PARAM_CHANNELS); if (snd_mask_single(m) && snd_interval_single(i)) { err = substream->ops->ioctl(substream, SNDRV_PCM_IOCTL1_FIFO_SIZE, params); if (err < 0) return err; } } if (!params->info) { params->info = substream->runtime->hw.info; params->info &= ~(SNDRV_PCM_INFO_FIFO_IN_FRAMES | SNDRV_PCM_INFO_DRAIN_TRIGGER); if (!hw_support_mmap(substream)) params->info &= ~(SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_MMAP_VALID); } return 0; } int snd_pcm_hw_refine(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params) { int err; params->info = 0; params->fifo_size = 0; if (params->rmask & (1 << SNDRV_PCM_HW_PARAM_SAMPLE_BITS)) params->msbits = 0; if (params->rmask & (1 << SNDRV_PCM_HW_PARAM_RATE)) { params->rate_num = 0; params->rate_den = 0; } err = constrain_mask_params(substream, params); if (err < 0) return err; err = constrain_interval_params(substream, params); if (err < 0) return err; err = constrain_params_by_rules(substream, params); if (err < 0) return err; params->rmask = 0; return 0; } EXPORT_SYMBOL(snd_pcm_hw_refine); static int snd_pcm_hw_refine_user(struct snd_pcm_substream *substream, struct snd_pcm_hw_params __user * _params) { struct snd_pcm_hw_params *params; int err; params = memdup_user(_params, sizeof(*params)); if (IS_ERR(params)) return PTR_ERR(params); err = snd_pcm_hw_refine(substream, params); if (err < 0) goto end; err = fixup_unreferenced_params(substream, params); if (err < 0) goto end; if (copy_to_user(_params, params, sizeof(*params))) err = -EFAULT; end: kfree(params); return err; } static int period_to_usecs(struct snd_pcm_runtime *runtime) { int usecs; if (! runtime->rate) return -1; /* invalid */ /* take 75% of period time as the deadline */ usecs = (750000 / runtime->rate) * runtime->period_size; usecs += ((750000 % runtime->rate) * runtime->period_size) / runtime->rate; return usecs; } static void snd_pcm_set_state(struct snd_pcm_substream *substream, int state) { snd_pcm_stream_lock_irq(substream); if (substream->runtime->status->state != SNDRV_PCM_STATE_DISCONNECTED) substream->runtime->status->state = state; snd_pcm_stream_unlock_irq(substream); } static inline void snd_pcm_timer_notify(struct snd_pcm_substream *substream, int event) { #ifdef CONFIG_SND_PCM_TIMER if (substream->timer) snd_timer_notify(substream->timer, event, &substream->runtime->trigger_tstamp); #endif } /** * snd_pcm_hw_param_choose - choose a configuration defined by @params * @pcm: PCM instance * @params: the hw_params instance * * Choose one configuration from configuration space defined by @params. * The configuration chosen is that obtained fixing in this order: * first access, first format, first subformat, min channels, * min rate, min period time, max buffer size, min tick time * * Return: Zero if successful, or a negative error code on failure. */ static int snd_pcm_hw_params_choose(struct snd_pcm_substream *pcm, struct snd_pcm_hw_params *params) { static const int vars[] = { SNDRV_PCM_HW_PARAM_ACCESS, SNDRV_PCM_HW_PARAM_FORMAT, SNDRV_PCM_HW_PARAM_SUBFORMAT, SNDRV_PCM_HW_PARAM_CHANNELS, SNDRV_PCM_HW_PARAM_RATE, SNDRV_PCM_HW_PARAM_PERIOD_TIME, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, SNDRV_PCM_HW_PARAM_TICK_TIME, -1 }; const int *v; struct snd_mask old_mask; struct snd_interval old_interval; int changed; for (v = vars; *v != -1; v++) { /* Keep old parameter to trace. */ if (trace_hw_mask_param_enabled()) { if (hw_is_mask(*v)) old_mask = *hw_param_mask(params, *v); } if (trace_hw_interval_param_enabled()) { if (hw_is_interval(*v)) old_interval = *hw_param_interval(params, *v); } if (*v != SNDRV_PCM_HW_PARAM_BUFFER_SIZE) changed = snd_pcm_hw_param_first(pcm, params, *v, NULL); else changed = snd_pcm_hw_param_last(pcm, params, *v, NULL); if (changed < 0) return changed; if (changed == 0) continue; /* Trace the changed parameter. */ if (hw_is_mask(*v)) { trace_hw_mask_param(pcm, *v, 0, &old_mask, hw_param_mask(params, *v)); } if (hw_is_interval(*v)) { trace_hw_interval_param(pcm, *v, 0, &old_interval, hw_param_interval(params, *v)); } } return 0; } /* acquire buffer_mutex; if it's in r/w operation, return -EBUSY, otherwise * block the further r/w operations */ static int snd_pcm_buffer_access_lock(struct snd_pcm_runtime *runtime) { if (!atomic_dec_unless_positive(&runtime->buffer_accessing)) return -EBUSY; mutex_lock(&runtime->buffer_mutex); return 0; /* keep buffer_mutex, unlocked by below */ } /* release buffer_mutex and clear r/w access flag */ static void snd_pcm_buffer_access_unlock(struct snd_pcm_runtime *runtime) { mutex_unlock(&runtime->buffer_mutex); atomic_inc(&runtime->buffer_accessing); } #if IS_ENABLED(CONFIG_SND_PCM_OSS) #define is_oss_stream(substream) ((substream)->oss.oss) #else #define is_oss_stream(substream) false #endif static int snd_pcm_hw_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params) { struct snd_pcm_runtime *runtime; int err, usecs; unsigned int bits; snd_pcm_uframes_t frames; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; err = snd_pcm_buffer_access_lock(runtime); if (err < 0) return err; snd_pcm_stream_lock_irq(substream); switch (runtime->status->state) { case SNDRV_PCM_STATE_OPEN: case SNDRV_PCM_STATE_SETUP: case SNDRV_PCM_STATE_PREPARED: if (!is_oss_stream(substream) && atomic_read(&substream->mmap_count)) err = -EBADFD; break; default: err = -EBADFD; break; } snd_pcm_stream_unlock_irq(substream); if (err) goto unlock; params->rmask = ~0U; err = snd_pcm_hw_refine(substream, params); if (err < 0) goto _error; err = snd_pcm_hw_params_choose(substream, params); if (err < 0) goto _error; err = fixup_unreferenced_params(substream, params); if (err < 0) goto _error; if (substream->ops->hw_params != NULL) { err = substream->ops->hw_params(substream, params); if (err < 0) goto _error; } runtime->access = params_access(params); runtime->format = params_format(params); runtime->subformat = params_subformat(params); runtime->channels = params_channels(params); runtime->rate = params_rate(params); runtime->period_size = params_period_size(params); runtime->periods = params_periods(params); runtime->buffer_size = params_buffer_size(params); runtime->info = params->info; runtime->rate_num = params->rate_num; runtime->rate_den = params->rate_den; runtime->no_period_wakeup = (params->info & SNDRV_PCM_INFO_NO_PERIOD_WAKEUP) && (params->flags & SNDRV_PCM_HW_PARAMS_NO_PERIOD_WAKEUP); bits = snd_pcm_format_physical_width(runtime->format); runtime->sample_bits = bits; bits *= runtime->channels; runtime->frame_bits = bits; frames = 1; while (bits % 8 != 0) { bits *= 2; frames *= 2; } runtime->byte_align = bits / 8; runtime->min_align = frames; /* Default sw params */ runtime->tstamp_mode = SNDRV_PCM_TSTAMP_NONE; runtime->period_step = 1; runtime->control->avail_min = runtime->period_size; runtime->start_threshold = 1; runtime->stop_threshold = runtime->buffer_size; runtime->silence_threshold = 0; runtime->silence_size = 0; runtime->boundary = runtime->buffer_size; while (runtime->boundary * 2 <= LONG_MAX - runtime->buffer_size) runtime->boundary *= 2; /* clear the buffer for avoiding possible kernel info leaks */ if (runtime->dma_area && !substream->ops->copy_user) { size_t size = runtime->dma_bytes; if (runtime->info & SNDRV_PCM_INFO_MMAP) size = PAGE_ALIGN(size); memset(runtime->dma_area, 0, size); } snd_pcm_timer_resolution_change(substream); snd_pcm_set_state(substream, SNDRV_PCM_STATE_SETUP); if (pm_qos_request_active(&substream->latency_pm_qos_req)) pm_qos_remove_request(&substream->latency_pm_qos_req); if ((usecs = period_to_usecs(runtime)) >= 0) pm_qos_add_request(&substream->latency_pm_qos_req, PM_QOS_CPU_DMA_LATENCY, usecs); err = 0; _error: if (err) { /* hardware might be unusable from this time, * so we force application to retry to set * the correct hardware parameter settings */ snd_pcm_set_state(substream, SNDRV_PCM_STATE_OPEN); if (substream->ops->hw_free != NULL) substream->ops->hw_free(substream); } unlock: snd_pcm_buffer_access_unlock(runtime); return err; } static int snd_pcm_hw_params_user(struct snd_pcm_substream *substream, struct snd_pcm_hw_params __user * _params) { struct snd_pcm_hw_params *params; int err; params = memdup_user(_params, sizeof(*params)); if (IS_ERR(params)) return PTR_ERR(params); err = snd_pcm_hw_params(substream, params); if (err < 0) goto end; if (copy_to_user(_params, params, sizeof(*params))) err = -EFAULT; end: kfree(params); return err; } static int snd_pcm_hw_free(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime; int result = 0; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; result = snd_pcm_buffer_access_lock(runtime); if (result < 0) return result; snd_pcm_stream_lock_irq(substream); switch (runtime->status->state) { case SNDRV_PCM_STATE_SETUP: case SNDRV_PCM_STATE_PREPARED: if (atomic_read(&substream->mmap_count)) result = -EBADFD; break; default: result = -EBADFD; break; } snd_pcm_stream_unlock_irq(substream); if (result) goto unlock; if (substream->ops->hw_free) result = substream->ops->hw_free(substream); snd_pcm_set_state(substream, SNDRV_PCM_STATE_OPEN); pm_qos_remove_request(&substream->latency_pm_qos_req); unlock: snd_pcm_buffer_access_unlock(runtime); return result; } static int snd_pcm_sw_params(struct snd_pcm_substream *substream, struct snd_pcm_sw_params *params) { struct snd_pcm_runtime *runtime; int err; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; snd_pcm_stream_lock_irq(substream); if (runtime->status->state == SNDRV_PCM_STATE_OPEN) { snd_pcm_stream_unlock_irq(substream); return -EBADFD; } snd_pcm_stream_unlock_irq(substream); if (params->tstamp_mode < 0 || params->tstamp_mode > SNDRV_PCM_TSTAMP_LAST) return -EINVAL; if (params->proto >= SNDRV_PROTOCOL_VERSION(2, 0, 12) && params->tstamp_type > SNDRV_PCM_TSTAMP_TYPE_LAST) return -EINVAL; if (params->avail_min == 0) return -EINVAL; if (params->silence_size >= runtime->boundary) { if (params->silence_threshold != 0) return -EINVAL; } else { if (params->silence_size > params->silence_threshold) return -EINVAL; if (params->silence_threshold > runtime->buffer_size) return -EINVAL; } err = 0; snd_pcm_stream_lock_irq(substream); runtime->tstamp_mode = params->tstamp_mode; if (params->proto >= SNDRV_PROTOCOL_VERSION(2, 0, 12)) runtime->tstamp_type = params->tstamp_type; runtime->period_step = params->period_step; runtime->control->avail_min = params->avail_min; runtime->start_threshold = params->start_threshold; runtime->stop_threshold = params->stop_threshold; runtime->silence_threshold = params->silence_threshold; runtime->silence_size = params->silence_size; params->boundary = runtime->boundary; if (snd_pcm_running(substream)) { if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK && runtime->silence_size > 0) snd_pcm_playback_silence(substream, ULONG_MAX); err = snd_pcm_update_state(substream, runtime); } snd_pcm_stream_unlock_irq(substream); return err; } static int snd_pcm_sw_params_user(struct snd_pcm_substream *substream, struct snd_pcm_sw_params __user * _params) { struct snd_pcm_sw_params params; int err; if (copy_from_user(¶ms, _params, sizeof(params))) return -EFAULT; err = snd_pcm_sw_params(substream, ¶ms); if (copy_to_user(_params, ¶ms, sizeof(params))) return -EFAULT; return err; } static inline snd_pcm_uframes_t snd_pcm_calc_delay(struct snd_pcm_substream *substream) { snd_pcm_uframes_t delay; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) delay = snd_pcm_playback_hw_avail(substream->runtime); else delay = snd_pcm_capture_avail(substream->runtime); return delay + substream->runtime->delay; } int snd_pcm_status(struct snd_pcm_substream *substream, struct snd_pcm_status *status) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_stream_lock_irq(substream); snd_pcm_unpack_audio_tstamp_config(status->audio_tstamp_data, &runtime->audio_tstamp_config); /* backwards compatible behavior */ if (runtime->audio_tstamp_config.type_requested == SNDRV_PCM_AUDIO_TSTAMP_TYPE_COMPAT) { if (runtime->hw.info & SNDRV_PCM_INFO_HAS_WALL_CLOCK) runtime->audio_tstamp_config.type_requested = SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK; else runtime->audio_tstamp_config.type_requested = SNDRV_PCM_AUDIO_TSTAMP_TYPE_DEFAULT; runtime->audio_tstamp_report.valid = 0; } else runtime->audio_tstamp_report.valid = 1; status->state = runtime->status->state; status->suspended_state = runtime->status->suspended_state; if (status->state == SNDRV_PCM_STATE_OPEN) goto _end; status->trigger_tstamp = runtime->trigger_tstamp; if (snd_pcm_running(substream)) { snd_pcm_update_hw_ptr(substream); if (runtime->tstamp_mode == SNDRV_PCM_TSTAMP_ENABLE) { status->tstamp = runtime->status->tstamp; status->driver_tstamp = runtime->driver_tstamp; status->audio_tstamp = runtime->status->audio_tstamp; if (runtime->audio_tstamp_report.valid == 1) /* backwards compatibility, no report provided in COMPAT mode */ snd_pcm_pack_audio_tstamp_report(&status->audio_tstamp_data, &status->audio_tstamp_accuracy, &runtime->audio_tstamp_report); goto _tstamp_end; } } else { /* get tstamp only in fallback mode and only if enabled */ if (runtime->tstamp_mode == SNDRV_PCM_TSTAMP_ENABLE) snd_pcm_gettime(runtime, &status->tstamp); } _tstamp_end: status->appl_ptr = runtime->control->appl_ptr; status->hw_ptr = runtime->status->hw_ptr; status->avail = snd_pcm_avail(substream); status->delay = snd_pcm_running(substream) ? snd_pcm_calc_delay(substream) : 0; status->avail_max = runtime->avail_max; status->overrange = runtime->overrange; runtime->avail_max = 0; runtime->overrange = 0; _end: snd_pcm_stream_unlock_irq(substream); return 0; } static int snd_pcm_status_user(struct snd_pcm_substream *substream, struct snd_pcm_status __user * _status, bool ext) { struct snd_pcm_status status; int res; memset(&status, 0, sizeof(status)); /* * with extension, parameters are read/write, * get audio_tstamp_data from user, * ignore rest of status structure */ if (ext && get_user(status.audio_tstamp_data, (u32 __user *)(&_status->audio_tstamp_data))) return -EFAULT; res = snd_pcm_status(substream, &status); if (res < 0) return res; if (copy_to_user(_status, &status, sizeof(status))) return -EFAULT; return 0; } static int snd_pcm_channel_info(struct snd_pcm_substream *substream, struct snd_pcm_channel_info * info) { struct snd_pcm_runtime *runtime; unsigned int channel; channel = info->channel; runtime = substream->runtime; snd_pcm_stream_lock_irq(substream); if (runtime->status->state == SNDRV_PCM_STATE_OPEN) { snd_pcm_stream_unlock_irq(substream); return -EBADFD; } snd_pcm_stream_unlock_irq(substream); if (channel >= runtime->channels) return -EINVAL; memset(info, 0, sizeof(*info)); info->channel = channel; return substream->ops->ioctl(substream, SNDRV_PCM_IOCTL1_CHANNEL_INFO, info); } static int snd_pcm_channel_info_user(struct snd_pcm_substream *substream, struct snd_pcm_channel_info __user * _info) { struct snd_pcm_channel_info info; int res; if (copy_from_user(&info, _info, sizeof(info))) return -EFAULT; res = snd_pcm_channel_info(substream, &info); if (res < 0) return res; if (copy_to_user(_info, &info, sizeof(info))) return -EFAULT; return 0; } static void snd_pcm_trigger_tstamp(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; if (runtime->trigger_master == NULL) return; if (runtime->trigger_master == substream) { if (!runtime->trigger_tstamp_latched) snd_pcm_gettime(runtime, &runtime->trigger_tstamp); } else { snd_pcm_trigger_tstamp(runtime->trigger_master); runtime->trigger_tstamp = runtime->trigger_master->runtime->trigger_tstamp; } runtime->trigger_master = NULL; } struct action_ops { int (*pre_action)(struct snd_pcm_substream *substream, int state); int (*do_action)(struct snd_pcm_substream *substream, int state); void (*undo_action)(struct snd_pcm_substream *substream, int state); void (*post_action)(struct snd_pcm_substream *substream, int state); }; /* * this functions is core for handling of linked stream * Note: the stream state might be changed also on failure * Note2: call with calling stream lock + link lock */ static int snd_pcm_action_group(const struct action_ops *ops, struct snd_pcm_substream *substream, int state, int stream_lock) { struct snd_pcm_substream *s = NULL; struct snd_pcm_substream *s1; int res = 0, depth = 1; snd_pcm_group_for_each_entry(s, substream) { if (s != substream) { if (!stream_lock) mutex_lock_nested(&s->runtime->buffer_mutex, depth); else if (s->pcm->nonatomic) mutex_lock_nested(&s->self_group.mutex, depth); else spin_lock_nested(&s->self_group.lock, depth); depth++; } res = ops->pre_action(s, state); if (res < 0) goto _unlock; } snd_pcm_group_for_each_entry(s, substream) { res = ops->do_action(s, state); if (res < 0) { if (ops->undo_action) { snd_pcm_group_for_each_entry(s1, substream) { if (s1 == s) /* failed stream */ break; ops->undo_action(s1, state); } } s = NULL; /* unlock all */ goto _unlock; } } snd_pcm_group_for_each_entry(s, substream) { ops->post_action(s, state); } _unlock: /* unlock streams */ snd_pcm_group_for_each_entry(s1, substream) { if (s1 != substream) { if (!stream_lock) mutex_unlock(&s1->runtime->buffer_mutex); else if (s1->pcm->nonatomic) mutex_unlock(&s1->self_group.mutex); else spin_unlock(&s1->self_group.lock); } if (s1 == s) /* end */ break; } return res; } /* * Note: call with stream lock */ static int snd_pcm_action_single(const struct action_ops *ops, struct snd_pcm_substream *substream, int state) { int res; res = ops->pre_action(substream, state); if (res < 0) return res; res = ops->do_action(substream, state); if (res == 0) ops->post_action(substream, state); else if (ops->undo_action) ops->undo_action(substream, state); return res; } /* * Note: call with stream lock */ static int snd_pcm_action(const struct action_ops *ops, struct snd_pcm_substream *substream, int state) { int res; if (!snd_pcm_stream_linked(substream)) return snd_pcm_action_single(ops, substream, state); if (substream->pcm->nonatomic) { if (!mutex_trylock(&substream->group->mutex)) { mutex_unlock(&substream->self_group.mutex); mutex_lock(&substream->group->mutex); mutex_lock(&substream->self_group.mutex); } res = snd_pcm_action_group(ops, substream, state, 1); mutex_unlock(&substream->group->mutex); } else { if (!spin_trylock(&substream->group->lock)) { spin_unlock(&substream->self_group.lock); spin_lock(&substream->group->lock); spin_lock(&substream->self_group.lock); } res = snd_pcm_action_group(ops, substream, state, 1); spin_unlock(&substream->group->lock); } return res; } /* * Note: don't use any locks before */ static int snd_pcm_action_lock_irq(const struct action_ops *ops, struct snd_pcm_substream *substream, int state) { int res; snd_pcm_stream_lock_irq(substream); res = snd_pcm_action(ops, substream, state); snd_pcm_stream_unlock_irq(substream); return res; } /* */ static int snd_pcm_action_nonatomic(const struct action_ops *ops, struct snd_pcm_substream *substream, int state) { int res; down_read(&snd_pcm_link_rwsem); res = snd_pcm_buffer_access_lock(substream->runtime); if (res < 0) goto unlock; if (snd_pcm_stream_linked(substream)) res = snd_pcm_action_group(ops, substream, state, 0); else res = snd_pcm_action_single(ops, substream, state); snd_pcm_buffer_access_unlock(substream->runtime); unlock: up_read(&snd_pcm_link_rwsem); return res; } /* * start callbacks */ static int snd_pcm_pre_start(struct snd_pcm_substream *substream, int state) { struct snd_pcm_runtime *runtime = substream->runtime; if (runtime->status->state != SNDRV_PCM_STATE_PREPARED) return -EBADFD; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK && !snd_pcm_playback_data(substream)) return -EPIPE; runtime->trigger_tstamp_latched = false; runtime->trigger_master = substream; return 0; } static int snd_pcm_do_start(struct snd_pcm_substream *substream, int state) { if (substream->runtime->trigger_master != substream) return 0; return substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_START); } static void snd_pcm_undo_start(struct snd_pcm_substream *substream, int state) { if (substream->runtime->trigger_master == substream) substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_STOP); } static void snd_pcm_post_start(struct snd_pcm_substream *substream, int state) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_trigger_tstamp(substream); runtime->hw_ptr_jiffies = jiffies; runtime->hw_ptr_buffer_jiffies = (runtime->buffer_size * HZ) / runtime->rate; runtime->status->state = state; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK && runtime->silence_size > 0) snd_pcm_playback_silence(substream, ULONG_MAX); snd_pcm_timer_notify(substream, SNDRV_TIMER_EVENT_MSTART); } static const struct action_ops snd_pcm_action_start = { .pre_action = snd_pcm_pre_start, .do_action = snd_pcm_do_start, .undo_action = snd_pcm_undo_start, .post_action = snd_pcm_post_start }; /** * snd_pcm_start - start all linked streams * @substream: the PCM substream instance * * Return: Zero if successful, or a negative error code. * The stream lock must be acquired before calling this function. */ int snd_pcm_start(struct snd_pcm_substream *substream) { return snd_pcm_action(&snd_pcm_action_start, substream, SNDRV_PCM_STATE_RUNNING); } /* take the stream lock and start the streams */ static int snd_pcm_start_lock_irq(struct snd_pcm_substream *substream) { return snd_pcm_action_lock_irq(&snd_pcm_action_start, substream, SNDRV_PCM_STATE_RUNNING); } /* * stop callbacks */ static int snd_pcm_pre_stop(struct snd_pcm_substream *substream, int state) { struct snd_pcm_runtime *runtime = substream->runtime; if (runtime->status->state == SNDRV_PCM_STATE_OPEN) return -EBADFD; runtime->trigger_master = substream; return 0; } static int snd_pcm_do_stop(struct snd_pcm_substream *substream, int state) { if (substream->runtime->trigger_master == substream && snd_pcm_running(substream)) substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_STOP); return 0; /* unconditonally stop all substreams */ } static void snd_pcm_post_stop(struct snd_pcm_substream *substream, int state) { struct snd_pcm_runtime *runtime = substream->runtime; if (runtime->status->state != state) { snd_pcm_trigger_tstamp(substream); runtime->status->state = state; snd_pcm_timer_notify(substream, SNDRV_TIMER_EVENT_MSTOP); } wake_up(&runtime->sleep); wake_up(&runtime->tsleep); } static const struct action_ops snd_pcm_action_stop = { .pre_action = snd_pcm_pre_stop, .do_action = snd_pcm_do_stop, .post_action = snd_pcm_post_stop }; /** * snd_pcm_stop - try to stop all running streams in the substream group * @substream: the PCM substream instance * @state: PCM state after stopping the stream * * The state of each stream is then changed to the given state unconditionally. * * Return: Zero if successful, or a negative error code. */ int snd_pcm_stop(struct snd_pcm_substream *substream, snd_pcm_state_t state) { return snd_pcm_action(&snd_pcm_action_stop, substream, state); } EXPORT_SYMBOL(snd_pcm_stop); /** * snd_pcm_drain_done - stop the DMA only when the given stream is playback * @substream: the PCM substream * * After stopping, the state is changed to SETUP. * Unlike snd_pcm_stop(), this affects only the given stream. * * Return: Zero if succesful, or a negative error code. */ int snd_pcm_drain_done(struct snd_pcm_substream *substream) { return snd_pcm_action_single(&snd_pcm_action_stop, substream, SNDRV_PCM_STATE_SETUP); } /** * snd_pcm_stop_xrun - stop the running streams as XRUN * @substream: the PCM substream instance * * This stops the given running substream (and all linked substreams) as XRUN. * Unlike snd_pcm_stop(), this function takes the substream lock by itself. * * Return: Zero if successful, or a negative error code. */ int snd_pcm_stop_xrun(struct snd_pcm_substream *substream) { unsigned long flags; snd_pcm_stream_lock_irqsave(substream, flags); if (substream->runtime && snd_pcm_running(substream)) __snd_pcm_xrun(substream); snd_pcm_stream_unlock_irqrestore(substream, flags); return 0; } EXPORT_SYMBOL_GPL(snd_pcm_stop_xrun); /* * pause callbacks */ static int snd_pcm_pre_pause(struct snd_pcm_substream *substream, int push) { struct snd_pcm_runtime *runtime = substream->runtime; if (!(runtime->info & SNDRV_PCM_INFO_PAUSE)) return -ENOSYS; if (push) { if (runtime->status->state != SNDRV_PCM_STATE_RUNNING) return -EBADFD; } else if (runtime->status->state != SNDRV_PCM_STATE_PAUSED) return -EBADFD; runtime->trigger_master = substream; return 0; } static int snd_pcm_do_pause(struct snd_pcm_substream *substream, int push) { if (substream->runtime->trigger_master != substream) return 0; /* some drivers might use hw_ptr to recover from the pause - update the hw_ptr now */ if (push) snd_pcm_update_hw_ptr(substream); /* The jiffies check in snd_pcm_update_hw_ptr*() is done by * a delta between the current jiffies, this gives a large enough * delta, effectively to skip the check once. */ substream->runtime->hw_ptr_jiffies = jiffies - HZ * 1000; return substream->ops->trigger(substream, push ? SNDRV_PCM_TRIGGER_PAUSE_PUSH : SNDRV_PCM_TRIGGER_PAUSE_RELEASE); } static void snd_pcm_undo_pause(struct snd_pcm_substream *substream, int push) { if (substream->runtime->trigger_master == substream) substream->ops->trigger(substream, push ? SNDRV_PCM_TRIGGER_PAUSE_RELEASE : SNDRV_PCM_TRIGGER_PAUSE_PUSH); } static void snd_pcm_post_pause(struct snd_pcm_substream *substream, int push) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_trigger_tstamp(substream); if (push) { runtime->status->state = SNDRV_PCM_STATE_PAUSED; snd_pcm_timer_notify(substream, SNDRV_TIMER_EVENT_MPAUSE); wake_up(&runtime->sleep); wake_up(&runtime->tsleep); } else { runtime->status->state = SNDRV_PCM_STATE_RUNNING; snd_pcm_timer_notify(substream, SNDRV_TIMER_EVENT_MCONTINUE); } } static const struct action_ops snd_pcm_action_pause = { .pre_action = snd_pcm_pre_pause, .do_action = snd_pcm_do_pause, .undo_action = snd_pcm_undo_pause, .post_action = snd_pcm_post_pause }; /* * Push/release the pause for all linked streams. */ static int snd_pcm_pause(struct snd_pcm_substream *substream, int push) { return snd_pcm_action(&snd_pcm_action_pause, substream, push); } #ifdef CONFIG_PM /* suspend */ static int snd_pcm_pre_suspend(struct snd_pcm_substream *substream, int state) { struct snd_pcm_runtime *runtime = substream->runtime; switch (runtime->status->state) { case SNDRV_PCM_STATE_SUSPENDED: return -EBUSY; /* unresumable PCM state; return -EBUSY for skipping suspend */ case SNDRV_PCM_STATE_OPEN: case SNDRV_PCM_STATE_SETUP: case SNDRV_PCM_STATE_DISCONNECTED: return -EBUSY; } runtime->trigger_master = substream; return 0; } static int snd_pcm_do_suspend(struct snd_pcm_substream *substream, int state) { struct snd_pcm_runtime *runtime = substream->runtime; if (runtime->trigger_master != substream) return 0; if (! snd_pcm_running(substream)) return 0; substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_SUSPEND); return 0; /* suspend unconditionally */ } static void snd_pcm_post_suspend(struct snd_pcm_substream *substream, int state) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_trigger_tstamp(substream); runtime->status->suspended_state = runtime->status->state; runtime->status->state = SNDRV_PCM_STATE_SUSPENDED; snd_pcm_timer_notify(substream, SNDRV_TIMER_EVENT_MSUSPEND); wake_up(&runtime->sleep); wake_up(&runtime->tsleep); } static const struct action_ops snd_pcm_action_suspend = { .pre_action = snd_pcm_pre_suspend, .do_action = snd_pcm_do_suspend, .post_action = snd_pcm_post_suspend }; /** * snd_pcm_suspend - trigger SUSPEND to all linked streams * @substream: the PCM substream * * After this call, all streams are changed to SUSPENDED state. * * Return: Zero if successful (or @substream is %NULL), or a negative error * code. */ int snd_pcm_suspend(struct snd_pcm_substream *substream) { int err; unsigned long flags; if (! substream) return 0; snd_pcm_stream_lock_irqsave(substream, flags); err = snd_pcm_action(&snd_pcm_action_suspend, substream, 0); snd_pcm_stream_unlock_irqrestore(substream, flags); return err; } EXPORT_SYMBOL(snd_pcm_suspend); /** * snd_pcm_suspend_all - trigger SUSPEND to all substreams in the given pcm * @pcm: the PCM instance * * After this call, all streams are changed to SUSPENDED state. * * Return: Zero if successful (or @pcm is %NULL), or a negative error code. */ int snd_pcm_suspend_all(struct snd_pcm *pcm) { struct snd_pcm_substream *substream; int stream, err = 0; if (! pcm) return 0; for (stream = 0; stream < 2; stream++) { for (substream = pcm->streams[stream].substream; substream; substream = substream->next) { /* FIXME: the open/close code should lock this as well */ if (substream->runtime == NULL) continue; /* * Skip BE dai link PCM's that are internal and may * not have their substream ops set. */ if (!substream->ops) continue; err = snd_pcm_suspend(substream); if (err < 0 && err != -EBUSY) return err; } } return 0; } EXPORT_SYMBOL(snd_pcm_suspend_all); /* resume */ static int snd_pcm_pre_resume(struct snd_pcm_substream *substream, int state) { struct snd_pcm_runtime *runtime = substream->runtime; if (!(runtime->info & SNDRV_PCM_INFO_RESUME)) return -ENOSYS; runtime->trigger_master = substream; return 0; } static int snd_pcm_do_resume(struct snd_pcm_substream *substream, int state) { struct snd_pcm_runtime *runtime = substream->runtime; if (runtime->trigger_master != substream) return 0; /* DMA not running previously? */ if (runtime->status->suspended_state != SNDRV_PCM_STATE_RUNNING && (runtime->status->suspended_state != SNDRV_PCM_STATE_DRAINING || substream->stream != SNDRV_PCM_STREAM_PLAYBACK)) return 0; return substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_RESUME); } static void snd_pcm_undo_resume(struct snd_pcm_substream *substream, int state) { if (substream->runtime->trigger_master == substream && snd_pcm_running(substream)) substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_SUSPEND); } static void snd_pcm_post_resume(struct snd_pcm_substream *substream, int state) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_trigger_tstamp(substream); runtime->status->state = runtime->status->suspended_state; snd_pcm_timer_notify(substream, SNDRV_TIMER_EVENT_MRESUME); } static const struct action_ops snd_pcm_action_resume = { .pre_action = snd_pcm_pre_resume, .do_action = snd_pcm_do_resume, .undo_action = snd_pcm_undo_resume, .post_action = snd_pcm_post_resume }; static int snd_pcm_resume(struct snd_pcm_substream *substream) { return snd_pcm_action_lock_irq(&snd_pcm_action_resume, substream, 0); } #else static int snd_pcm_resume(struct snd_pcm_substream *substream) { return -ENOSYS; } #endif /* CONFIG_PM */ /* * xrun ioctl * * Change the RUNNING stream(s) to XRUN state. */ static int snd_pcm_xrun(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; int result; snd_pcm_stream_lock_irq(substream); switch (runtime->status->state) { case SNDRV_PCM_STATE_XRUN: result = 0; /* already there */ break; case SNDRV_PCM_STATE_RUNNING: __snd_pcm_xrun(substream); result = 0; break; default: result = -EBADFD; } snd_pcm_stream_unlock_irq(substream); return result; } /* * reset ioctl */ static int snd_pcm_pre_reset(struct snd_pcm_substream *substream, int state) { struct snd_pcm_runtime *runtime = substream->runtime; switch (runtime->status->state) { case SNDRV_PCM_STATE_RUNNING: case SNDRV_PCM_STATE_PREPARED: case SNDRV_PCM_STATE_PAUSED: case SNDRV_PCM_STATE_SUSPENDED: return 0; default: return -EBADFD; } } static int snd_pcm_do_reset(struct snd_pcm_substream *substream, int state) { struct snd_pcm_runtime *runtime = substream->runtime; int err = substream->ops->ioctl(substream, SNDRV_PCM_IOCTL1_RESET, NULL); if (err < 0) return err; snd_pcm_stream_lock_irq(substream); runtime->hw_ptr_base = 0; runtime->hw_ptr_interrupt = runtime->status->hw_ptr - runtime->status->hw_ptr % runtime->period_size; runtime->silence_start = runtime->status->hw_ptr; runtime->silence_filled = 0; snd_pcm_stream_unlock_irq(substream); return 0; } static void snd_pcm_post_reset(struct snd_pcm_substream *substream, int state) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_stream_lock_irq(substream); runtime->control->appl_ptr = runtime->status->hw_ptr; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK && runtime->silence_size > 0) snd_pcm_playback_silence(substream, ULONG_MAX); snd_pcm_stream_unlock_irq(substream); } static const struct action_ops snd_pcm_action_reset = { .pre_action = snd_pcm_pre_reset, .do_action = snd_pcm_do_reset, .post_action = snd_pcm_post_reset }; static int snd_pcm_reset(struct snd_pcm_substream *substream) { return snd_pcm_action_nonatomic(&snd_pcm_action_reset, substream, 0); } /* * prepare ioctl */ /* we use the second argument for updating f_flags */ static int snd_pcm_pre_prepare(struct snd_pcm_substream *substream, int f_flags) { struct snd_pcm_runtime *runtime = substream->runtime; if (runtime->status->state == SNDRV_PCM_STATE_OPEN || runtime->status->state == SNDRV_PCM_STATE_DISCONNECTED) return -EBADFD; if (snd_pcm_running(substream)) return -EBUSY; substream->f_flags = f_flags; return 0; } static int snd_pcm_do_prepare(struct snd_pcm_substream *substream, int state) { int err; err = substream->ops->prepare(substream); if (err < 0) return err; return snd_pcm_do_reset(substream, 0); } static void snd_pcm_post_prepare(struct snd_pcm_substream *substream, int state) { struct snd_pcm_runtime *runtime = substream->runtime; runtime->control->appl_ptr = runtime->status->hw_ptr; snd_pcm_set_state(substream, SNDRV_PCM_STATE_PREPARED); } static const struct action_ops snd_pcm_action_prepare = { .pre_action = snd_pcm_pre_prepare, .do_action = snd_pcm_do_prepare, .post_action = snd_pcm_post_prepare }; /** * snd_pcm_prepare - prepare the PCM substream to be triggerable * @substream: the PCM substream instance * @file: file to refer f_flags * * Return: Zero if successful, or a negative error code. */ static int snd_pcm_prepare(struct snd_pcm_substream *substream, struct file *file) { int f_flags; if (file) f_flags = file->f_flags; else f_flags = substream->f_flags; snd_pcm_stream_lock_irq(substream); switch (substream->runtime->status->state) { case SNDRV_PCM_STATE_PAUSED: snd_pcm_pause(substream, 0); /* fallthru */ case SNDRV_PCM_STATE_SUSPENDED: snd_pcm_stop(substream, SNDRV_PCM_STATE_SETUP); break; } snd_pcm_stream_unlock_irq(substream); return snd_pcm_action_nonatomic(&snd_pcm_action_prepare, substream, f_flags); } /* * drain ioctl */ static int snd_pcm_pre_drain_init(struct snd_pcm_substream *substream, int state) { struct snd_pcm_runtime *runtime = substream->runtime; switch (runtime->status->state) { case SNDRV_PCM_STATE_OPEN: case SNDRV_PCM_STATE_DISCONNECTED: case SNDRV_PCM_STATE_SUSPENDED: return -EBADFD; } runtime->trigger_master = substream; return 0; } static int snd_pcm_do_drain_init(struct snd_pcm_substream *substream, int state) { struct snd_pcm_runtime *runtime = substream->runtime; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { switch (runtime->status->state) { case SNDRV_PCM_STATE_PREPARED: /* start playback stream if possible */ if (! snd_pcm_playback_empty(substream)) { snd_pcm_do_start(substream, SNDRV_PCM_STATE_DRAINING); snd_pcm_post_start(substream, SNDRV_PCM_STATE_DRAINING); } else { runtime->status->state = SNDRV_PCM_STATE_SETUP; } break; case SNDRV_PCM_STATE_RUNNING: runtime->status->state = SNDRV_PCM_STATE_DRAINING; break; case SNDRV_PCM_STATE_XRUN: runtime->status->state = SNDRV_PCM_STATE_SETUP; break; default: break; } } else { /* stop running stream */ if (runtime->status->state == SNDRV_PCM_STATE_RUNNING) { int new_state = snd_pcm_capture_avail(runtime) > 0 ? SNDRV_PCM_STATE_DRAINING : SNDRV_PCM_STATE_SETUP; snd_pcm_do_stop(substream, new_state); snd_pcm_post_stop(substream, new_state); } } if (runtime->status->state == SNDRV_PCM_STATE_DRAINING && runtime->trigger_master == substream && (runtime->hw.info & SNDRV_PCM_INFO_DRAIN_TRIGGER)) return substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_DRAIN); return 0; } static void snd_pcm_post_drain_init(struct snd_pcm_substream *substream, int state) { } static const struct action_ops snd_pcm_action_drain_init = { .pre_action = snd_pcm_pre_drain_init, .do_action = snd_pcm_do_drain_init, .post_action = snd_pcm_post_drain_init }; static int snd_pcm_drop(struct snd_pcm_substream *substream); /* * Drain the stream(s). * When the substream is linked, sync until the draining of all playback streams * is finished. * After this call, all streams are supposed to be either SETUP or DRAINING * (capture only) state. */ static int snd_pcm_drain(struct snd_pcm_substream *substream, struct file *file) { struct snd_card *card; struct snd_pcm_runtime *runtime; struct snd_pcm_substream *s; wait_queue_entry_t wait; int result = 0; int nonblock = 0; card = substream->pcm->card; runtime = substream->runtime; if (runtime->status->state == SNDRV_PCM_STATE_OPEN) return -EBADFD; if (file) { if (file->f_flags & O_NONBLOCK) nonblock = 1; } else if (substream->f_flags & O_NONBLOCK) nonblock = 1; down_read(&snd_pcm_link_rwsem); snd_pcm_stream_lock_irq(substream); /* resume pause */ if (runtime->status->state == SNDRV_PCM_STATE_PAUSED) snd_pcm_pause(substream, 0); /* pre-start/stop - all running streams are changed to DRAINING state */ result = snd_pcm_action(&snd_pcm_action_drain_init, substream, 0); if (result < 0) goto unlock; /* in non-blocking, we don't wait in ioctl but let caller poll */ if (nonblock) { result = -EAGAIN; goto unlock; } for (;;) { long tout; struct snd_pcm_runtime *to_check; if (signal_pending(current)) { result = -ERESTARTSYS; break; } /* find a substream to drain */ to_check = NULL; snd_pcm_group_for_each_entry(s, substream) { if (s->stream != SNDRV_PCM_STREAM_PLAYBACK) continue; runtime = s->runtime; if (runtime->status->state == SNDRV_PCM_STATE_DRAINING) { to_check = runtime; break; } } if (!to_check) break; /* all drained */ init_waitqueue_entry(&wait, current); add_wait_queue(&to_check->sleep, &wait); snd_pcm_stream_unlock_irq(substream); up_read(&snd_pcm_link_rwsem); if (runtime->no_period_wakeup) tout = MAX_SCHEDULE_TIMEOUT; else { tout = 10; if (runtime->rate) { long t = runtime->period_size * 2 / runtime->rate; tout = max(t, tout); } tout = msecs_to_jiffies(tout * 1000); } tout = schedule_timeout_interruptible(tout); down_read(&snd_pcm_link_rwsem); snd_pcm_stream_lock_irq(substream); remove_wait_queue(&to_check->sleep, &wait); if (card->shutdown) { result = -ENODEV; break; } if (tout == 0) { if (substream->runtime->status->state == SNDRV_PCM_STATE_SUSPENDED) result = -ESTRPIPE; else { dev_dbg(substream->pcm->card->dev, "playback drain error (DMA or IRQ trouble?)\n"); snd_pcm_stop(substream, SNDRV_PCM_STATE_SETUP); result = -EIO; } break; } } unlock: snd_pcm_stream_unlock_irq(substream); up_read(&snd_pcm_link_rwsem); return result; } /* * drop ioctl * * Immediately put all linked substreams into SETUP state. */ static int snd_pcm_drop(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime; int result = 0; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; if (runtime->status->state == SNDRV_PCM_STATE_OPEN || runtime->status->state == SNDRV_PCM_STATE_DISCONNECTED) return -EBADFD; snd_pcm_stream_lock_irq(substream); /* resume pause */ if (runtime->status->state == SNDRV_PCM_STATE_PAUSED) snd_pcm_pause(substream, 0); snd_pcm_stop(substream, SNDRV_PCM_STATE_SETUP); /* runtime->control->appl_ptr = runtime->status->hw_ptr; */ snd_pcm_stream_unlock_irq(substream); return result; } static bool is_pcm_file(struct file *file) { struct inode *inode = file_inode(file); unsigned int minor; if (!S_ISCHR(inode->i_mode) || imajor(inode) != snd_major) return false; minor = iminor(inode); return snd_lookup_minor_data(minor, SNDRV_DEVICE_TYPE_PCM_PLAYBACK) || snd_lookup_minor_data(minor, SNDRV_DEVICE_TYPE_PCM_CAPTURE); } /* * PCM link handling */ static int snd_pcm_link(struct snd_pcm_substream *substream, int fd) { int res = 0; struct snd_pcm_file *pcm_file; struct snd_pcm_substream *substream1; struct snd_pcm_group *group; struct fd f = fdget(fd); if (!f.file) return -EBADFD; if (!is_pcm_file(f.file)) { res = -EBADFD; goto _badf; } pcm_file = f.file->private_data; substream1 = pcm_file->substream; if (substream == substream1) { res = -EINVAL; goto _badf; } group = kmalloc(sizeof(*group), GFP_KERNEL); if (!group) { res = -ENOMEM; goto _nolock; } down_write_nonfifo(&snd_pcm_link_rwsem); write_lock_irq(&snd_pcm_link_rwlock); if (substream->runtime->status->state == SNDRV_PCM_STATE_OPEN || substream->runtime->status->state != substream1->runtime->status->state || substream->pcm->nonatomic != substream1->pcm->nonatomic) { res = -EBADFD; goto _end; } if (snd_pcm_stream_linked(substream1)) { res = -EALREADY; goto _end; } if (!snd_pcm_stream_linked(substream)) { substream->group = group; group = NULL; spin_lock_init(&substream->group->lock); mutex_init(&substream->group->mutex); INIT_LIST_HEAD(&substream->group->substreams); list_add_tail(&substream->link_list, &substream->group->substreams); substream->group->count = 1; } list_add_tail(&substream1->link_list, &substream->group->substreams); substream->group->count++; substream1->group = substream->group; _end: write_unlock_irq(&snd_pcm_link_rwlock); up_write(&snd_pcm_link_rwsem); _nolock: snd_card_unref(substream1->pcm->card); kfree(group); _badf: fdput(f); return res; } static void relink_to_local(struct snd_pcm_substream *substream) { substream->group = &substream->self_group; INIT_LIST_HEAD(&substream->self_group.substreams); list_add_tail(&substream->link_list, &substream->self_group.substreams); } static int snd_pcm_unlink(struct snd_pcm_substream *substream) { struct snd_pcm_substream *s; int res = 0; down_write_nonfifo(&snd_pcm_link_rwsem); write_lock_irq(&snd_pcm_link_rwlock); if (!snd_pcm_stream_linked(substream)) { res = -EALREADY; goto _end; } list_del(&substream->link_list); substream->group->count--; if (substream->group->count == 1) { /* detach the last stream, too */ snd_pcm_group_for_each_entry(s, substream) { relink_to_local(s); break; } kfree(substream->group); } relink_to_local(substream); _end: write_unlock_irq(&snd_pcm_link_rwlock); up_write(&snd_pcm_link_rwsem); return res; } /* * hw configurator */ static int snd_pcm_hw_rule_mul(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct snd_interval t; snd_interval_mul(hw_param_interval_c(params, rule->deps[0]), hw_param_interval_c(params, rule->deps[1]), &t); return snd_interval_refine(hw_param_interval(params, rule->var), &t); } static int snd_pcm_hw_rule_div(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct snd_interval t; snd_interval_div(hw_param_interval_c(params, rule->deps[0]), hw_param_interval_c(params, rule->deps[1]), &t); return snd_interval_refine(hw_param_interval(params, rule->var), &t); } static int snd_pcm_hw_rule_muldivk(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct snd_interval t; snd_interval_muldivk(hw_param_interval_c(params, rule->deps[0]), hw_param_interval_c(params, rule->deps[1]), (unsigned long) rule->private, &t); return snd_interval_refine(hw_param_interval(params, rule->var), &t); } static int snd_pcm_hw_rule_mulkdiv(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct snd_interval t; snd_interval_mulkdiv(hw_param_interval_c(params, rule->deps[0]), (unsigned long) rule->private, hw_param_interval_c(params, rule->deps[1]), &t); return snd_interval_refine(hw_param_interval(params, rule->var), &t); } static int snd_pcm_hw_rule_format(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { unsigned int k; const struct snd_interval *i = hw_param_interval_c(params, rule->deps[0]); struct snd_mask m; struct snd_mask *mask = hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT); snd_mask_any(&m); for (k = 0; k <= SNDRV_PCM_FORMAT_LAST; ++k) { int bits; if (! snd_mask_test(mask, k)) continue; bits = snd_pcm_format_physical_width(k); if (bits <= 0) continue; /* ignore invalid formats */ if ((unsigned)bits < i->min || (unsigned)bits > i->max) snd_mask_reset(&m, k); } return snd_mask_refine(mask, &m); } static int snd_pcm_hw_rule_sample_bits(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct snd_interval t; unsigned int k; t.min = UINT_MAX; t.max = 0; t.openmin = 0; t.openmax = 0; for (k = 0; k <= SNDRV_PCM_FORMAT_LAST; ++k) { int bits; if (! snd_mask_test(hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT), k)) continue; bits = snd_pcm_format_physical_width(k); if (bits <= 0) continue; /* ignore invalid formats */ if (t.min > (unsigned)bits) t.min = bits; if (t.max < (unsigned)bits) t.max = bits; } t.integer = 1; return snd_interval_refine(hw_param_interval(params, rule->var), &t); } #if SNDRV_PCM_RATE_5512 != 1 << 0 || SNDRV_PCM_RATE_192000 != 1 << 12 #error "Change this table" #endif static const unsigned int rates[] = { 5512, 8000, 11025, 16000, 22050, 32000, 44100, 48000, 64000, 88200, 96000, 176400, 192000 }; const struct snd_pcm_hw_constraint_list snd_pcm_known_rates = { .count = ARRAY_SIZE(rates), .list = rates, }; static int snd_pcm_hw_rule_rate(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct snd_pcm_hardware *hw = rule->private; return snd_interval_list(hw_param_interval(params, rule->var), snd_pcm_known_rates.count, snd_pcm_known_rates.list, hw->rates); } static int snd_pcm_hw_rule_buffer_bytes_max(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct snd_interval t; struct snd_pcm_substream *substream = rule->private; t.min = 0; t.max = substream->buffer_bytes_max; t.openmin = 0; t.openmax = 0; t.integer = 1; return snd_interval_refine(hw_param_interval(params, rule->var), &t); } int snd_pcm_hw_constraints_init(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct snd_pcm_hw_constraints *constrs = &runtime->hw_constraints; int k, err; for (k = SNDRV_PCM_HW_PARAM_FIRST_MASK; k <= SNDRV_PCM_HW_PARAM_LAST_MASK; k++) { snd_mask_any(constrs_mask(constrs, k)); } for (k = SNDRV_PCM_HW_PARAM_FIRST_INTERVAL; k <= SNDRV_PCM_HW_PARAM_LAST_INTERVAL; k++) { snd_interval_any(constrs_interval(constrs, k)); } snd_interval_setinteger(constrs_interval(constrs, SNDRV_PCM_HW_PARAM_CHANNELS)); snd_interval_setinteger(constrs_interval(constrs, SNDRV_PCM_HW_PARAM_BUFFER_SIZE)); snd_interval_setinteger(constrs_interval(constrs, SNDRV_PCM_HW_PARAM_BUFFER_BYTES)); snd_interval_setinteger(constrs_interval(constrs, SNDRV_PCM_HW_PARAM_SAMPLE_BITS)); snd_interval_setinteger(constrs_interval(constrs, SNDRV_PCM_HW_PARAM_FRAME_BITS)); err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_FORMAT, snd_pcm_hw_rule_format, NULL, SNDRV_PCM_HW_PARAM_SAMPLE_BITS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_SAMPLE_BITS, snd_pcm_hw_rule_sample_bits, NULL, SNDRV_PCM_HW_PARAM_FORMAT, SNDRV_PCM_HW_PARAM_SAMPLE_BITS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_SAMPLE_BITS, snd_pcm_hw_rule_div, NULL, SNDRV_PCM_HW_PARAM_FRAME_BITS, SNDRV_PCM_HW_PARAM_CHANNELS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_FRAME_BITS, snd_pcm_hw_rule_mul, NULL, SNDRV_PCM_HW_PARAM_SAMPLE_BITS, SNDRV_PCM_HW_PARAM_CHANNELS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_FRAME_BITS, snd_pcm_hw_rule_mulkdiv, (void*) 8, SNDRV_PCM_HW_PARAM_PERIOD_BYTES, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_FRAME_BITS, snd_pcm_hw_rule_mulkdiv, (void*) 8, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_CHANNELS, snd_pcm_hw_rule_div, NULL, SNDRV_PCM_HW_PARAM_FRAME_BITS, SNDRV_PCM_HW_PARAM_SAMPLE_BITS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_RATE, snd_pcm_hw_rule_mulkdiv, (void*) 1000000, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, SNDRV_PCM_HW_PARAM_PERIOD_TIME, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_RATE, snd_pcm_hw_rule_mulkdiv, (void*) 1000000, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, SNDRV_PCM_HW_PARAM_BUFFER_TIME, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIODS, snd_pcm_hw_rule_div, NULL, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, snd_pcm_hw_rule_div, NULL, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, SNDRV_PCM_HW_PARAM_PERIODS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, snd_pcm_hw_rule_mulkdiv, (void*) 8, SNDRV_PCM_HW_PARAM_PERIOD_BYTES, SNDRV_PCM_HW_PARAM_FRAME_BITS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, snd_pcm_hw_rule_muldivk, (void*) 1000000, SNDRV_PCM_HW_PARAM_PERIOD_TIME, SNDRV_PCM_HW_PARAM_RATE, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, snd_pcm_hw_rule_mul, NULL, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, SNDRV_PCM_HW_PARAM_PERIODS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, snd_pcm_hw_rule_mulkdiv, (void*) 8, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, SNDRV_PCM_HW_PARAM_FRAME_BITS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, snd_pcm_hw_rule_muldivk, (void*) 1000000, SNDRV_PCM_HW_PARAM_BUFFER_TIME, SNDRV_PCM_HW_PARAM_RATE, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_BYTES, snd_pcm_hw_rule_muldivk, (void*) 8, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, SNDRV_PCM_HW_PARAM_FRAME_BITS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, snd_pcm_hw_rule_muldivk, (void*) 8, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, SNDRV_PCM_HW_PARAM_FRAME_BITS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_TIME, snd_pcm_hw_rule_mulkdiv, (void*) 1000000, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, SNDRV_PCM_HW_PARAM_RATE, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_TIME, snd_pcm_hw_rule_mulkdiv, (void*) 1000000, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, SNDRV_PCM_HW_PARAM_RATE, -1); if (err < 0) return err; return 0; } int snd_pcm_hw_constraints_complete(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct snd_pcm_hardware *hw = &runtime->hw; int err; unsigned int mask = 0; if (hw->info & SNDRV_PCM_INFO_INTERLEAVED) mask |= 1 << SNDRV_PCM_ACCESS_RW_INTERLEAVED; if (hw->info & SNDRV_PCM_INFO_NONINTERLEAVED) mask |= 1 << SNDRV_PCM_ACCESS_RW_NONINTERLEAVED; if (hw_support_mmap(substream)) { if (hw->info & SNDRV_PCM_INFO_INTERLEAVED) mask |= 1 << SNDRV_PCM_ACCESS_MMAP_INTERLEAVED; if (hw->info & SNDRV_PCM_INFO_NONINTERLEAVED) mask |= 1 << SNDRV_PCM_ACCESS_MMAP_NONINTERLEAVED; if (hw->info & SNDRV_PCM_INFO_COMPLEX) mask |= 1 << SNDRV_PCM_ACCESS_MMAP_COMPLEX; } err = snd_pcm_hw_constraint_mask(runtime, SNDRV_PCM_HW_PARAM_ACCESS, mask); if (err < 0) return err; err = snd_pcm_hw_constraint_mask64(runtime, SNDRV_PCM_HW_PARAM_FORMAT, hw->formats); if (err < 0) return err; err = snd_pcm_hw_constraint_mask(runtime, SNDRV_PCM_HW_PARAM_SUBFORMAT, 1 << SNDRV_PCM_SUBFORMAT_STD); if (err < 0) return err; err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_CHANNELS, hw->channels_min, hw->channels_max); if (err < 0) return err; err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_RATE, hw->rate_min, hw->rate_max); if (err < 0) return err; err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_PERIOD_BYTES, hw->period_bytes_min, hw->period_bytes_max); if (err < 0) return err; err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_PERIODS, hw->periods_min, hw->periods_max); if (err < 0) return err; err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, hw->period_bytes_min, hw->buffer_bytes_max); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, snd_pcm_hw_rule_buffer_bytes_max, substream, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, -1); if (err < 0) return err; /* FIXME: remove */ if (runtime->dma_bytes) { err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, 0, runtime->dma_bytes); if (err < 0) return err; } if (!(hw->rates & (SNDRV_PCM_RATE_KNOT | SNDRV_PCM_RATE_CONTINUOUS))) { err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_RATE, snd_pcm_hw_rule_rate, hw, SNDRV_PCM_HW_PARAM_RATE, -1); if (err < 0) return err; } /* FIXME: this belong to lowlevel */ snd_pcm_hw_constraint_integer(runtime, SNDRV_PCM_HW_PARAM_PERIOD_SIZE); return 0; } static void pcm_release_private(struct snd_pcm_substream *substream) { if (snd_pcm_stream_linked(substream)) snd_pcm_unlink(substream); } void snd_pcm_release_substream(struct snd_pcm_substream *substream) { substream->ref_count--; if (substream->ref_count > 0) return; snd_pcm_drop(substream); if (substream->hw_opened) { if (substream->ops->hw_free && substream->runtime->status->state != SNDRV_PCM_STATE_OPEN) substream->ops->hw_free(substream); substream->ops->close(substream); substream->hw_opened = 0; } if (pm_qos_request_active(&substream->latency_pm_qos_req)) pm_qos_remove_request(&substream->latency_pm_qos_req); if (substream->pcm_release) { substream->pcm_release(substream); substream->pcm_release = NULL; } snd_pcm_detach_substream(substream); } EXPORT_SYMBOL(snd_pcm_release_substream); int snd_pcm_open_substream(struct snd_pcm *pcm, int stream, struct file *file, struct snd_pcm_substream **rsubstream) { struct snd_pcm_substream *substream; int err; err = snd_pcm_attach_substream(pcm, stream, file, &substream); if (err < 0) return err; if (substream->ref_count > 1) { *rsubstream = substream; return 0; } err = snd_pcm_hw_constraints_init(substream); if (err < 0) { pcm_dbg(pcm, "snd_pcm_hw_constraints_init failed\n"); goto error; } if ((err = substream->ops->open(substream)) < 0) goto error; substream->hw_opened = 1; err = snd_pcm_hw_constraints_complete(substream); if (err < 0) { pcm_dbg(pcm, "snd_pcm_hw_constraints_complete failed\n"); goto error; } *rsubstream = substream; return 0; error: snd_pcm_release_substream(substream); return err; } EXPORT_SYMBOL(snd_pcm_open_substream); static int snd_pcm_open_file(struct file *file, struct snd_pcm *pcm, int stream) { struct snd_pcm_file *pcm_file; struct snd_pcm_substream *substream; int err; err = snd_pcm_open_substream(pcm, stream, file, &substream); if (err < 0) return err; pcm_file = kzalloc(sizeof(*pcm_file), GFP_KERNEL); if (pcm_file == NULL) { snd_pcm_release_substream(substream); return -ENOMEM; } pcm_file->substream = substream; if (substream->ref_count == 1) { substream->file = pcm_file; substream->pcm_release = pcm_release_private; } file->private_data = pcm_file; return 0; } static int snd_pcm_playback_open(struct inode *inode, struct file *file) { struct snd_pcm *pcm; int err = nonseekable_open(inode, file); if (err < 0) return err; pcm = snd_lookup_minor_data(iminor(inode), SNDRV_DEVICE_TYPE_PCM_PLAYBACK); err = snd_pcm_open(file, pcm, SNDRV_PCM_STREAM_PLAYBACK); if (pcm) snd_card_unref(pcm->card); return err; } static int snd_pcm_capture_open(struct inode *inode, struct file *file) { struct snd_pcm *pcm; int err = nonseekable_open(inode, file); if (err < 0) return err; pcm = snd_lookup_minor_data(iminor(inode), SNDRV_DEVICE_TYPE_PCM_CAPTURE); err = snd_pcm_open(file, pcm, SNDRV_PCM_STREAM_CAPTURE); if (pcm) snd_card_unref(pcm->card); return err; } static int snd_pcm_open(struct file *file, struct snd_pcm *pcm, int stream) { int err; wait_queue_entry_t wait; if (pcm == NULL) { err = -ENODEV; goto __error1; } err = snd_card_file_add(pcm->card, file); if (err < 0) goto __error1; if (!try_module_get(pcm->card->module)) { err = -EFAULT; goto __error2; } init_waitqueue_entry(&wait, current); add_wait_queue(&pcm->open_wait, &wait); mutex_lock(&pcm->open_mutex); while (1) { err = snd_pcm_open_file(file, pcm, stream); if (err >= 0) break; if (err == -EAGAIN) { if (file->f_flags & O_NONBLOCK) { err = -EBUSY; break; } } else break; set_current_state(TASK_INTERRUPTIBLE); mutex_unlock(&pcm->open_mutex); schedule(); mutex_lock(&pcm->open_mutex); if (pcm->card->shutdown) { err = -ENODEV; break; } if (signal_pending(current)) { err = -ERESTARTSYS; break; } } remove_wait_queue(&pcm->open_wait, &wait); mutex_unlock(&pcm->open_mutex); if (err < 0) goto __error; return err; __error: module_put(pcm->card->module); __error2: snd_card_file_remove(pcm->card, file); __error1: return err; } static int snd_pcm_release(struct inode *inode, struct file *file) { struct snd_pcm *pcm; struct snd_pcm_substream *substream; struct snd_pcm_file *pcm_file; pcm_file = file->private_data; substream = pcm_file->substream; if (snd_BUG_ON(!substream)) return -ENXIO; pcm = substream->pcm; mutex_lock(&pcm->open_mutex); snd_pcm_release_substream(substream); kfree(pcm_file); mutex_unlock(&pcm->open_mutex); wake_up(&pcm->open_wait); module_put(pcm->card->module); snd_card_file_remove(pcm->card, file); return 0; } /* check and update PCM state; return 0 or a negative error * call this inside PCM lock */ static int do_pcm_hwsync(struct snd_pcm_substream *substream) { switch (substream->runtime->status->state) { case SNDRV_PCM_STATE_DRAINING: if (substream->stream == SNDRV_PCM_STREAM_CAPTURE) return -EBADFD; /* Fall through */ case SNDRV_PCM_STATE_RUNNING: return snd_pcm_update_hw_ptr(substream); case SNDRV_PCM_STATE_PREPARED: case SNDRV_PCM_STATE_PAUSED: return 0; case SNDRV_PCM_STATE_SUSPENDED: return -ESTRPIPE; case SNDRV_PCM_STATE_XRUN: return -EPIPE; default: return -EBADFD; } } /* increase the appl_ptr; returns the processed frames or a negative error */ static snd_pcm_sframes_t forward_appl_ptr(struct snd_pcm_substream *substream, snd_pcm_uframes_t frames, snd_pcm_sframes_t avail) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_sframes_t appl_ptr; int ret; if (avail <= 0) return 0; if (frames > (snd_pcm_uframes_t)avail) frames = avail; appl_ptr = runtime->control->appl_ptr + frames; if (appl_ptr >= (snd_pcm_sframes_t)runtime->boundary) appl_ptr -= runtime->boundary; ret = pcm_lib_apply_appl_ptr(substream, appl_ptr); return ret < 0 ? ret : frames; } /* decrease the appl_ptr; returns the processed frames or zero for error */ static snd_pcm_sframes_t rewind_appl_ptr(struct snd_pcm_substream *substream, snd_pcm_uframes_t frames, snd_pcm_sframes_t avail) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_sframes_t appl_ptr; int ret; if (avail <= 0) return 0; if (frames > (snd_pcm_uframes_t)avail) frames = avail; appl_ptr = runtime->control->appl_ptr - frames; if (appl_ptr < 0) appl_ptr += runtime->boundary; ret = pcm_lib_apply_appl_ptr(substream, appl_ptr); /* NOTE: we return zero for errors because PulseAudio gets depressed * upon receiving an error from rewind ioctl and stops processing * any longer. Returning zero means that no rewind is done, so * it's not absolutely wrong to answer like that. */ return ret < 0 ? 0 : frames; } static snd_pcm_sframes_t snd_pcm_rewind(struct snd_pcm_substream *substream, snd_pcm_uframes_t frames) { snd_pcm_sframes_t ret; if (frames == 0) return 0; snd_pcm_stream_lock_irq(substream); ret = do_pcm_hwsync(substream); if (!ret) ret = rewind_appl_ptr(substream, frames, snd_pcm_hw_avail(substream)); snd_pcm_stream_unlock_irq(substream); return ret; } static snd_pcm_sframes_t snd_pcm_forward(struct snd_pcm_substream *substream, snd_pcm_uframes_t frames) { snd_pcm_sframes_t ret; if (frames == 0) return 0; snd_pcm_stream_lock_irq(substream); ret = do_pcm_hwsync(substream); if (!ret) ret = forward_appl_ptr(substream, frames, snd_pcm_avail(substream)); snd_pcm_stream_unlock_irq(substream); return ret; } static int snd_pcm_hwsync(struct snd_pcm_substream *substream) { int err; snd_pcm_stream_lock_irq(substream); err = do_pcm_hwsync(substream); snd_pcm_stream_unlock_irq(substream); return err; } static int snd_pcm_delay(struct snd_pcm_substream *substream, snd_pcm_sframes_t *delay) { int err; snd_pcm_sframes_t n = 0; snd_pcm_stream_lock_irq(substream); err = do_pcm_hwsync(substream); if (!err) n = snd_pcm_calc_delay(substream); snd_pcm_stream_unlock_irq(substream); if (!err) *delay = n; return err; } static int snd_pcm_sync_ptr(struct snd_pcm_substream *substream, struct snd_pcm_sync_ptr __user *_sync_ptr) { struct snd_pcm_runtime *runtime = substream->runtime; struct snd_pcm_sync_ptr sync_ptr; volatile struct snd_pcm_mmap_status *status; volatile struct snd_pcm_mmap_control *control; int err; memset(&sync_ptr, 0, sizeof(sync_ptr)); if (get_user(sync_ptr.flags, (unsigned __user *)&(_sync_ptr->flags))) return -EFAULT; if (copy_from_user(&sync_ptr.c.control, &(_sync_ptr->c.control), sizeof(struct snd_pcm_mmap_control))) return -EFAULT; status = runtime->status; control = runtime->control; if (sync_ptr.flags & SNDRV_PCM_SYNC_PTR_HWSYNC) { err = snd_pcm_hwsync(substream); if (err < 0) return err; } snd_pcm_stream_lock_irq(substream); if (!(sync_ptr.flags & SNDRV_PCM_SYNC_PTR_APPL)) { err = pcm_lib_apply_appl_ptr(substream, sync_ptr.c.control.appl_ptr); if (err < 0) { snd_pcm_stream_unlock_irq(substream); return err; } } else { sync_ptr.c.control.appl_ptr = control->appl_ptr; } if (!(sync_ptr.flags & SNDRV_PCM_SYNC_PTR_AVAIL_MIN)) control->avail_min = sync_ptr.c.control.avail_min; else sync_ptr.c.control.avail_min = control->avail_min; sync_ptr.s.status.state = status->state; sync_ptr.s.status.hw_ptr = status->hw_ptr; sync_ptr.s.status.tstamp = status->tstamp; sync_ptr.s.status.suspended_state = status->suspended_state; sync_ptr.s.status.audio_tstamp = status->audio_tstamp; snd_pcm_stream_unlock_irq(substream); if (copy_to_user(_sync_ptr, &sync_ptr, sizeof(sync_ptr))) return -EFAULT; return 0; } static int snd_pcm_tstamp(struct snd_pcm_substream *substream, int __user *_arg) { struct snd_pcm_runtime *runtime = substream->runtime; int arg; if (get_user(arg, _arg)) return -EFAULT; if (arg < 0 || arg > SNDRV_PCM_TSTAMP_TYPE_LAST) return -EINVAL; runtime->tstamp_type = arg; return 0; } static int snd_pcm_xferi_frames_ioctl(struct snd_pcm_substream *substream, struct snd_xferi __user *_xferi) { struct snd_xferi xferi; struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_sframes_t result; if (runtime->status->state == SNDRV_PCM_STATE_OPEN) return -EBADFD; if (put_user(0, &_xferi->result)) return -EFAULT; if (copy_from_user(&xferi, _xferi, sizeof(xferi))) return -EFAULT; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) result = snd_pcm_lib_write(substream, xferi.buf, xferi.frames); else result = snd_pcm_lib_read(substream, xferi.buf, xferi.frames); __put_user(result, &_xferi->result); return result < 0 ? result : 0; } static int snd_pcm_xfern_frames_ioctl(struct snd_pcm_substream *substream, struct snd_xfern __user *_xfern) { struct snd_xfern xfern; struct snd_pcm_runtime *runtime = substream->runtime; void *bufs; snd_pcm_sframes_t result; if (runtime->status->state == SNDRV_PCM_STATE_OPEN) return -EBADFD; if (runtime->channels > 128) return -EINVAL; if (put_user(0, &_xfern->result)) return -EFAULT; if (copy_from_user(&xfern, _xfern, sizeof(xfern))) return -EFAULT; bufs = memdup_user(xfern.bufs, sizeof(void *) * runtime->channels); if (IS_ERR(bufs)) return PTR_ERR(bufs); if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) result = snd_pcm_lib_writev(substream, bufs, xfern.frames); else result = snd_pcm_lib_readv(substream, bufs, xfern.frames); kfree(bufs); __put_user(result, &_xfern->result); return result < 0 ? result : 0; } static int snd_pcm_rewind_ioctl(struct snd_pcm_substream *substream, snd_pcm_uframes_t __user *_frames) { snd_pcm_uframes_t frames; snd_pcm_sframes_t result; if (get_user(frames, _frames)) return -EFAULT; if (put_user(0, _frames)) return -EFAULT; result = snd_pcm_rewind(substream, frames); __put_user(result, _frames); return result < 0 ? result : 0; } static int snd_pcm_forward_ioctl(struct snd_pcm_substream *substream, snd_pcm_uframes_t __user *_frames) { snd_pcm_uframes_t frames; snd_pcm_sframes_t result; if (get_user(frames, _frames)) return -EFAULT; if (put_user(0, _frames)) return -EFAULT; result = snd_pcm_forward(substream, frames); __put_user(result, _frames); return result < 0 ? result : 0; } static int snd_pcm_common_ioctl(struct file *file, struct snd_pcm_substream *substream, unsigned int cmd, void __user *arg) { struct snd_pcm_file *pcm_file = file->private_data; int res; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; res = snd_power_wait(substream->pcm->card, SNDRV_CTL_POWER_D0); if (res < 0) return res; switch (cmd) { case SNDRV_PCM_IOCTL_PVERSION: return put_user(SNDRV_PCM_VERSION, (int __user *)arg) ? -EFAULT : 0; case SNDRV_PCM_IOCTL_INFO: return snd_pcm_info_user(substream, arg); case SNDRV_PCM_IOCTL_TSTAMP: /* just for compatibility */ return 0; case SNDRV_PCM_IOCTL_TTSTAMP: return snd_pcm_tstamp(substream, arg); case SNDRV_PCM_IOCTL_USER_PVERSION: if (get_user(pcm_file->user_pversion, (unsigned int __user *)arg)) return -EFAULT; return 0; case SNDRV_PCM_IOCTL_HW_REFINE: return snd_pcm_hw_refine_user(substream, arg); case SNDRV_PCM_IOCTL_HW_PARAMS: return snd_pcm_hw_params_user(substream, arg); case SNDRV_PCM_IOCTL_HW_FREE: return snd_pcm_hw_free(substream); case SNDRV_PCM_IOCTL_SW_PARAMS: return snd_pcm_sw_params_user(substream, arg); case SNDRV_PCM_IOCTL_STATUS: return snd_pcm_status_user(substream, arg, false); case SNDRV_PCM_IOCTL_STATUS_EXT: return snd_pcm_status_user(substream, arg, true); case SNDRV_PCM_IOCTL_CHANNEL_INFO: return snd_pcm_channel_info_user(substream, arg); case SNDRV_PCM_IOCTL_PREPARE: return snd_pcm_prepare(substream, file); case SNDRV_PCM_IOCTL_RESET: return snd_pcm_reset(substream); case SNDRV_PCM_IOCTL_START: return snd_pcm_start_lock_irq(substream); case SNDRV_PCM_IOCTL_LINK: return snd_pcm_link(substream, (int)(unsigned long) arg); case SNDRV_PCM_IOCTL_UNLINK: return snd_pcm_unlink(substream); case SNDRV_PCM_IOCTL_RESUME: return snd_pcm_resume(substream); case SNDRV_PCM_IOCTL_XRUN: return snd_pcm_xrun(substream); case SNDRV_PCM_IOCTL_HWSYNC: return snd_pcm_hwsync(substream); case SNDRV_PCM_IOCTL_DELAY: { snd_pcm_sframes_t delay; snd_pcm_sframes_t __user *res = arg; int err; err = snd_pcm_delay(substream, &delay); if (err) return err; if (put_user(delay, res)) return -EFAULT; return 0; } case SNDRV_PCM_IOCTL_SYNC_PTR: return snd_pcm_sync_ptr(substream, arg); #ifdef CONFIG_SND_SUPPORT_OLD_API case SNDRV_PCM_IOCTL_HW_REFINE_OLD: return snd_pcm_hw_refine_old_user(substream, arg); case SNDRV_PCM_IOCTL_HW_PARAMS_OLD: return snd_pcm_hw_params_old_user(substream, arg); #endif case SNDRV_PCM_IOCTL_DRAIN: return snd_pcm_drain(substream, file); case SNDRV_PCM_IOCTL_DROP: return snd_pcm_drop(substream); case SNDRV_PCM_IOCTL_PAUSE: return snd_pcm_action_lock_irq(&snd_pcm_action_pause, substream, (int)(unsigned long)arg); case SNDRV_PCM_IOCTL_WRITEI_FRAMES: case SNDRV_PCM_IOCTL_READI_FRAMES: return snd_pcm_xferi_frames_ioctl(substream, arg); case SNDRV_PCM_IOCTL_WRITEN_FRAMES: case SNDRV_PCM_IOCTL_READN_FRAMES: return snd_pcm_xfern_frames_ioctl(substream, arg); case SNDRV_PCM_IOCTL_REWIND: return snd_pcm_rewind_ioctl(substream, arg); case SNDRV_PCM_IOCTL_FORWARD: return snd_pcm_forward_ioctl(substream, arg); } pcm_dbg(substream->pcm, "unknown ioctl = 0x%x\n", cmd); return -ENOTTY; } static long snd_pcm_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct snd_pcm_file *pcm_file; pcm_file = file->private_data; if (((cmd >> 8) & 0xff) != 'A') return -ENOTTY; return snd_pcm_common_ioctl(file, pcm_file->substream, cmd, (void __user *)arg); } /** * snd_pcm_kernel_ioctl - Execute PCM ioctl in the kernel-space * @substream: PCM substream * @cmd: IOCTL cmd * @arg: IOCTL argument * * The function is provided primarily for OSS layer and USB gadget drivers, * and it allows only the limited set of ioctls (hw_params, sw_params, * prepare, start, drain, drop, forward). */ int snd_pcm_kernel_ioctl(struct snd_pcm_substream *substream, unsigned int cmd, void *arg) { snd_pcm_uframes_t *frames = arg; snd_pcm_sframes_t result; switch (cmd) { case SNDRV_PCM_IOCTL_FORWARD: { /* provided only for OSS; capture-only and no value returned */ if (substream->stream != SNDRV_PCM_STREAM_CAPTURE) return -EINVAL; result = snd_pcm_forward(substream, *frames); return result < 0 ? result : 0; } case SNDRV_PCM_IOCTL_HW_PARAMS: return snd_pcm_hw_params(substream, arg); case SNDRV_PCM_IOCTL_SW_PARAMS: return snd_pcm_sw_params(substream, arg); case SNDRV_PCM_IOCTL_PREPARE: return snd_pcm_prepare(substream, NULL); case SNDRV_PCM_IOCTL_START: return snd_pcm_start_lock_irq(substream); case SNDRV_PCM_IOCTL_DRAIN: return snd_pcm_drain(substream, NULL); case SNDRV_PCM_IOCTL_DROP: return snd_pcm_drop(substream); case SNDRV_PCM_IOCTL_DELAY: return snd_pcm_delay(substream, frames); default: return -EINVAL; } } EXPORT_SYMBOL(snd_pcm_kernel_ioctl); static ssize_t snd_pcm_read(struct file *file, char __user *buf, size_t count, loff_t * offset) { struct snd_pcm_file *pcm_file; struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; snd_pcm_sframes_t result; pcm_file = file->private_data; substream = pcm_file->substream; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; if (runtime->status->state == SNDRV_PCM_STATE_OPEN) return -EBADFD; if (!frame_aligned(runtime, count)) return -EINVAL; count = bytes_to_frames(runtime, count); result = snd_pcm_lib_read(substream, buf, count); if (result > 0) result = frames_to_bytes(runtime, result); return result; } static ssize_t snd_pcm_write(struct file *file, const char __user *buf, size_t count, loff_t * offset) { struct snd_pcm_file *pcm_file; struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; snd_pcm_sframes_t result; pcm_file = file->private_data; substream = pcm_file->substream; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; if (runtime->status->state == SNDRV_PCM_STATE_OPEN) return -EBADFD; if (!frame_aligned(runtime, count)) return -EINVAL; count = bytes_to_frames(runtime, count); result = snd_pcm_lib_write(substream, buf, count); if (result > 0) result = frames_to_bytes(runtime, result); return result; } static ssize_t snd_pcm_readv(struct kiocb *iocb, struct iov_iter *to) { struct snd_pcm_file *pcm_file; struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; snd_pcm_sframes_t result; unsigned long i; void __user **bufs; snd_pcm_uframes_t frames; pcm_file = iocb->ki_filp->private_data; substream = pcm_file->substream; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; if (runtime->status->state == SNDRV_PCM_STATE_OPEN) return -EBADFD; if (!iter_is_iovec(to)) return -EINVAL; if (to->nr_segs > 1024 || to->nr_segs != runtime->channels) return -EINVAL; if (!frame_aligned(runtime, to->iov->iov_len)) return -EINVAL; frames = bytes_to_samples(runtime, to->iov->iov_len); bufs = kmalloc_array(to->nr_segs, sizeof(void *), GFP_KERNEL); if (bufs == NULL) return -ENOMEM; for (i = 0; i < to->nr_segs; ++i) bufs[i] = to->iov[i].iov_base; result = snd_pcm_lib_readv(substream, bufs, frames); if (result > 0) result = frames_to_bytes(runtime, result); kfree(bufs); return result; } static ssize_t snd_pcm_writev(struct kiocb *iocb, struct iov_iter *from) { struct snd_pcm_file *pcm_file; struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; snd_pcm_sframes_t result; unsigned long i; void __user **bufs; snd_pcm_uframes_t frames; pcm_file = iocb->ki_filp->private_data; substream = pcm_file->substream; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; if (runtime->status->state == SNDRV_PCM_STATE_OPEN) return -EBADFD; if (!iter_is_iovec(from)) return -EINVAL; if (from->nr_segs > 128 || from->nr_segs != runtime->channels || !frame_aligned(runtime, from->iov->iov_len)) return -EINVAL; frames = bytes_to_samples(runtime, from->iov->iov_len); bufs = kmalloc_array(from->nr_segs, sizeof(void *), GFP_KERNEL); if (bufs == NULL) return -ENOMEM; for (i = 0; i < from->nr_segs; ++i) bufs[i] = from->iov[i].iov_base; result = snd_pcm_lib_writev(substream, bufs, frames); if (result > 0) result = frames_to_bytes(runtime, result); kfree(bufs); return result; } static __poll_t snd_pcm_poll(struct file *file, poll_table *wait) { struct snd_pcm_file *pcm_file; struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; __poll_t mask, ok; snd_pcm_uframes_t avail; pcm_file = file->private_data; substream = pcm_file->substream; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) ok = EPOLLOUT | EPOLLWRNORM; else ok = EPOLLIN | EPOLLRDNORM; if (PCM_RUNTIME_CHECK(substream)) return ok | EPOLLERR; runtime = substream->runtime; poll_wait(file, &runtime->sleep, wait); mask = 0; snd_pcm_stream_lock_irq(substream); avail = snd_pcm_avail(substream); switch (runtime->status->state) { case SNDRV_PCM_STATE_RUNNING: case SNDRV_PCM_STATE_PREPARED: case SNDRV_PCM_STATE_PAUSED: if (avail >= runtime->control->avail_min) mask = ok; break; case SNDRV_PCM_STATE_DRAINING: if (substream->stream == SNDRV_PCM_STREAM_CAPTURE) { mask = ok; if (!avail) mask |= EPOLLERR; } break; default: mask = ok | EPOLLERR; break; } snd_pcm_stream_unlock_irq(substream); return mask; } /* * mmap support */ /* * Only on coherent architectures, we can mmap the status and the control records * for effcient data transfer. On others, we have to use HWSYNC ioctl... */ #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_ALPHA) /* * mmap status record */ static vm_fault_t snd_pcm_mmap_status_fault(struct vm_fault *vmf) { struct snd_pcm_substream *substream = vmf->vma->vm_private_data; struct snd_pcm_runtime *runtime; if (substream == NULL) return VM_FAULT_SIGBUS; runtime = substream->runtime; vmf->page = virt_to_page(runtime->status); get_page(vmf->page); return 0; } static const struct vm_operations_struct snd_pcm_vm_ops_status = { .fault = snd_pcm_mmap_status_fault, }; static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file *file, struct vm_area_struct *area) { long size; if (!(area->vm_flags & VM_READ)) return -EINVAL; size = area->vm_end - area->vm_start; if (size != PAGE_ALIGN(sizeof(struct snd_pcm_mmap_status))) return -EINVAL; area->vm_ops = &snd_pcm_vm_ops_status; area->vm_private_data = substream; area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; return 0; } /* * mmap control record */ static vm_fault_t snd_pcm_mmap_control_fault(struct vm_fault *vmf) { struct snd_pcm_substream *substream = vmf->vma->vm_private_data; struct snd_pcm_runtime *runtime; if (substream == NULL) return VM_FAULT_SIGBUS; runtime = substream->runtime; vmf->page = virt_to_page(runtime->control); get_page(vmf->page); return 0; } static const struct vm_operations_struct snd_pcm_vm_ops_control = { .fault = snd_pcm_mmap_control_fault, }; static int snd_pcm_mmap_control(struct snd_pcm_substream *substream, struct file *file, struct vm_area_struct *area) { long size; if (!(area->vm_flags & VM_READ)) return -EINVAL; size = area->vm_end - area->vm_start; if (size != PAGE_ALIGN(sizeof(struct snd_pcm_mmap_control))) return -EINVAL; area->vm_ops = &snd_pcm_vm_ops_control; area->vm_private_data = substream; area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; return 0; } static bool pcm_status_mmap_allowed(struct snd_pcm_file *pcm_file) { if (pcm_file->no_compat_mmap) return false; /* See pcm_control_mmap_allowed() below. * Since older alsa-lib requires both status and control mmaps to be * coupled, we have to disable the status mmap for old alsa-lib, too. */ if (pcm_file->user_pversion < SNDRV_PROTOCOL_VERSION(2, 0, 14) && (pcm_file->substream->runtime->hw.info & SNDRV_PCM_INFO_SYNC_APPLPTR)) return false; return true; } static bool pcm_control_mmap_allowed(struct snd_pcm_file *pcm_file) { if (pcm_file->no_compat_mmap) return false; /* Disallow the control mmap when SYNC_APPLPTR flag is set; * it enforces the user-space to fall back to snd_pcm_sync_ptr(), * thus it effectively assures the manual update of appl_ptr. */ if (pcm_file->substream->runtime->hw.info & SNDRV_PCM_INFO_SYNC_APPLPTR) return false; return true; } #else /* ! coherent mmap */ /* * don't support mmap for status and control records. */ #define pcm_status_mmap_allowed(pcm_file) false #define pcm_control_mmap_allowed(pcm_file) false static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file *file, struct vm_area_struct *area) { return -ENXIO; } static int snd_pcm_mmap_control(struct snd_pcm_substream *substream, struct file *file, struct vm_area_struct *area) { return -ENXIO; } #endif /* coherent mmap */ static inline struct page * snd_pcm_default_page_ops(struct snd_pcm_substream *substream, unsigned long ofs) { void *vaddr = substream->runtime->dma_area + ofs; return virt_to_page(vaddr); } /* * fault callback for mmapping a RAM page */ static vm_fault_t snd_pcm_mmap_data_fault(struct vm_fault *vmf) { struct snd_pcm_substream *substream = vmf->vma->vm_private_data; struct snd_pcm_runtime *runtime; unsigned long offset; struct page * page; size_t dma_bytes; if (substream == NULL) return VM_FAULT_SIGBUS; runtime = substream->runtime; offset = vmf->pgoff << PAGE_SHIFT; dma_bytes = PAGE_ALIGN(runtime->dma_bytes); if (offset > dma_bytes - PAGE_SIZE) return VM_FAULT_SIGBUS; if (substream->ops->page) page = substream->ops->page(substream, offset); else page = snd_pcm_default_page_ops(substream, offset); if (!page) return VM_FAULT_SIGBUS; get_page(page); vmf->page = page; return 0; } static const struct vm_operations_struct snd_pcm_vm_ops_data = { .open = snd_pcm_mmap_data_open, .close = snd_pcm_mmap_data_close, }; static const struct vm_operations_struct snd_pcm_vm_ops_data_fault = { .open = snd_pcm_mmap_data_open, .close = snd_pcm_mmap_data_close, .fault = snd_pcm_mmap_data_fault, }; /* * mmap the DMA buffer on RAM */ /** * snd_pcm_lib_default_mmap - Default PCM data mmap function * @substream: PCM substream * @area: VMA * * This is the default mmap handler for PCM data. When mmap pcm_ops is NULL, * this function is invoked implicitly. */ int snd_pcm_lib_default_mmap(struct snd_pcm_substream *substream, struct vm_area_struct *area) { area->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; #ifdef CONFIG_GENERIC_ALLOCATOR if (substream->dma_buffer.dev.type == SNDRV_DMA_TYPE_DEV_IRAM) { area->vm_page_prot = pgprot_writecombine(area->vm_page_prot); return remap_pfn_range(area, area->vm_start, substream->dma_buffer.addr >> PAGE_SHIFT, area->vm_end - area->vm_start, area->vm_page_prot); } #endif /* CONFIG_GENERIC_ALLOCATOR */ #ifndef CONFIG_X86 /* for avoiding warnings arch/x86/mm/pat.c */ if (IS_ENABLED(CONFIG_HAS_DMA) && !substream->ops->page && substream->dma_buffer.dev.type == SNDRV_DMA_TYPE_DEV) return dma_mmap_coherent(substream->dma_buffer.dev.dev, area, substream->runtime->dma_area, substream->runtime->dma_addr, substream->runtime->dma_bytes); #endif /* CONFIG_X86 */ /* mmap with fault handler */ area->vm_ops = &snd_pcm_vm_ops_data_fault; return 0; } EXPORT_SYMBOL_GPL(snd_pcm_lib_default_mmap); /* * mmap the DMA buffer on I/O memory area */ #if SNDRV_PCM_INFO_MMAP_IOMEM /** * snd_pcm_lib_mmap_iomem - Default PCM data mmap function for I/O mem * @substream: PCM substream * @area: VMA * * When your hardware uses the iomapped pages as the hardware buffer and * wants to mmap it, pass this function as mmap pcm_ops. Note that this * is supposed to work only on limited architectures. */ int snd_pcm_lib_mmap_iomem(struct snd_pcm_substream *substream, struct vm_area_struct *area) { struct snd_pcm_runtime *runtime = substream->runtime; area->vm_page_prot = pgprot_noncached(area->vm_page_prot); return vm_iomap_memory(area, runtime->dma_addr, runtime->dma_bytes); } EXPORT_SYMBOL(snd_pcm_lib_mmap_iomem); #endif /* SNDRV_PCM_INFO_MMAP */ /* * mmap DMA buffer */ int snd_pcm_mmap_data(struct snd_pcm_substream *substream, struct file *file, struct vm_area_struct *area) { struct snd_pcm_runtime *runtime; long size; unsigned long offset; size_t dma_bytes; int err; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { if (!(area->vm_flags & (VM_WRITE|VM_READ))) return -EINVAL; } else { if (!(area->vm_flags & VM_READ)) return -EINVAL; } runtime = substream->runtime; if (runtime->status->state == SNDRV_PCM_STATE_OPEN) return -EBADFD; if (!(runtime->info & SNDRV_PCM_INFO_MMAP)) return -ENXIO; if (runtime->access == SNDRV_PCM_ACCESS_RW_INTERLEAVED || runtime->access == SNDRV_PCM_ACCESS_RW_NONINTERLEAVED) return -EINVAL; size = area->vm_end - area->vm_start; offset = area->vm_pgoff << PAGE_SHIFT; dma_bytes = PAGE_ALIGN(runtime->dma_bytes); if ((size_t)size > dma_bytes) return -EINVAL; if (offset > dma_bytes - size) return -EINVAL; area->vm_ops = &snd_pcm_vm_ops_data; area->vm_private_data = substream; if (substream->ops->mmap) err = substream->ops->mmap(substream, area); else err = snd_pcm_lib_default_mmap(substream, area); if (!err) atomic_inc(&substream->mmap_count); return err; } EXPORT_SYMBOL(snd_pcm_mmap_data); static int snd_pcm_mmap(struct file *file, struct vm_area_struct *area) { struct snd_pcm_file * pcm_file; struct snd_pcm_substream *substream; unsigned long offset; pcm_file = file->private_data; substream = pcm_file->substream; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; offset = area->vm_pgoff << PAGE_SHIFT; switch (offset) { case SNDRV_PCM_MMAP_OFFSET_STATUS: if (!pcm_status_mmap_allowed(pcm_file)) return -ENXIO; return snd_pcm_mmap_status(substream, file, area); case SNDRV_PCM_MMAP_OFFSET_CONTROL: if (!pcm_control_mmap_allowed(pcm_file)) return -ENXIO; return snd_pcm_mmap_control(substream, file, area); default: return snd_pcm_mmap_data(substream, file, area); } return 0; } static int snd_pcm_fasync(int fd, struct file * file, int on) { struct snd_pcm_file * pcm_file; struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; pcm_file = file->private_data; substream = pcm_file->substream; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; return fasync_helper(fd, file, on, &runtime->fasync); } /* * ioctl32 compat */ #ifdef CONFIG_COMPAT #include "pcm_compat.c" #else #define snd_pcm_ioctl_compat NULL #endif /* * To be removed helpers to keep binary compatibility */ #ifdef CONFIG_SND_SUPPORT_OLD_API #define __OLD_TO_NEW_MASK(x) ((x&7)|((x&0x07fffff8)<<5)) #define __NEW_TO_OLD_MASK(x) ((x&7)|((x&0xffffff00)>>5)) static void snd_pcm_hw_convert_from_old_params(struct snd_pcm_hw_params *params, struct snd_pcm_hw_params_old *oparams) { unsigned int i; memset(params, 0, sizeof(*params)); params->flags = oparams->flags; for (i = 0; i < ARRAY_SIZE(oparams->masks); i++) params->masks[i].bits[0] = oparams->masks[i]; memcpy(params->intervals, oparams->intervals, sizeof(oparams->intervals)); params->rmask = __OLD_TO_NEW_MASK(oparams->rmask); params->cmask = __OLD_TO_NEW_MASK(oparams->cmask); params->info = oparams->info; params->msbits = oparams->msbits; params->rate_num = oparams->rate_num; params->rate_den = oparams->rate_den; params->fifo_size = oparams->fifo_size; } static void snd_pcm_hw_convert_to_old_params(struct snd_pcm_hw_params_old *oparams, struct snd_pcm_hw_params *params) { unsigned int i; memset(oparams, 0, sizeof(*oparams)); oparams->flags = params->flags; for (i = 0; i < ARRAY_SIZE(oparams->masks); i++) oparams->masks[i] = params->masks[i].bits[0]; memcpy(oparams->intervals, params->intervals, sizeof(oparams->intervals)); oparams->rmask = __NEW_TO_OLD_MASK(params->rmask); oparams->cmask = __NEW_TO_OLD_MASK(params->cmask); oparams->info = params->info; oparams->msbits = params->msbits; oparams->rate_num = params->rate_num; oparams->rate_den = params->rate_den; oparams->fifo_size = params->fifo_size; } static int snd_pcm_hw_refine_old_user(struct snd_pcm_substream *substream, struct snd_pcm_hw_params_old __user * _oparams) { struct snd_pcm_hw_params *params; struct snd_pcm_hw_params_old *oparams = NULL; int err; params = kmalloc(sizeof(*params), GFP_KERNEL); if (!params) return -ENOMEM; oparams = memdup_user(_oparams, sizeof(*oparams)); if (IS_ERR(oparams)) { err = PTR_ERR(oparams); goto out; } snd_pcm_hw_convert_from_old_params(params, oparams); err = snd_pcm_hw_refine(substream, params); if (err < 0) goto out_old; err = fixup_unreferenced_params(substream, params); if (err < 0) goto out_old; snd_pcm_hw_convert_to_old_params(oparams, params); if (copy_to_user(_oparams, oparams, sizeof(*oparams))) err = -EFAULT; out_old: kfree(oparams); out: kfree(params); return err; } static int snd_pcm_hw_params_old_user(struct snd_pcm_substream *substream, struct snd_pcm_hw_params_old __user * _oparams) { struct snd_pcm_hw_params *params; struct snd_pcm_hw_params_old *oparams = NULL; int err; params = kmalloc(sizeof(*params), GFP_KERNEL); if (!params) return -ENOMEM; oparams = memdup_user(_oparams, sizeof(*oparams)); if (IS_ERR(oparams)) { err = PTR_ERR(oparams); goto out; } snd_pcm_hw_convert_from_old_params(params, oparams); err = snd_pcm_hw_params(substream, params); if (err < 0) goto out_old; snd_pcm_hw_convert_to_old_params(oparams, params); if (copy_to_user(_oparams, oparams, sizeof(*oparams))) err = -EFAULT; out_old: kfree(oparams); out: kfree(params); return err; } #endif /* CONFIG_SND_SUPPORT_OLD_API */ #ifndef CONFIG_MMU static unsigned long snd_pcm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct snd_pcm_file *pcm_file = file->private_data; struct snd_pcm_substream *substream = pcm_file->substream; struct snd_pcm_runtime *runtime = substream->runtime; unsigned long offset = pgoff << PAGE_SHIFT; switch (offset) { case SNDRV_PCM_MMAP_OFFSET_STATUS: return (unsigned long)runtime->status; case SNDRV_PCM_MMAP_OFFSET_CONTROL: return (unsigned long)runtime->control; default: return (unsigned long)runtime->dma_area + offset; } } #else # define snd_pcm_get_unmapped_area NULL #endif /* * Register section */ const struct file_operations snd_pcm_f_ops[2] = { { .owner = THIS_MODULE, .write = snd_pcm_write, .write_iter = snd_pcm_writev, .open = snd_pcm_playback_open, .release = snd_pcm_release, .llseek = no_llseek, .poll = snd_pcm_poll, .unlocked_ioctl = snd_pcm_ioctl, .compat_ioctl = snd_pcm_ioctl_compat, .mmap = snd_pcm_mmap, .fasync = snd_pcm_fasync, .get_unmapped_area = snd_pcm_get_unmapped_area, }, { .owner = THIS_MODULE, .read = snd_pcm_read, .read_iter = snd_pcm_readv, .open = snd_pcm_capture_open, .release = snd_pcm_release, .llseek = no_llseek, .poll = snd_pcm_poll, .unlocked_ioctl = snd_pcm_ioctl, .compat_ioctl = snd_pcm_ioctl_compat, .mmap = snd_pcm_mmap, .fasync = snd_pcm_fasync, .get_unmapped_area = snd_pcm_get_unmapped_area, } }; |
88 122 10 38 21 21 20 9 10 52 10 51 22 52 3 64 63 3 63 4 1 169 130 169 189 4 3 27 22 33 4 580 580 10 10 10 10 10 10 10 10 61 557 557 9 557 38 3 557 557 556 557 55 54 64 64 4 4 4 4 4 4 4 54 3 54 47 47 47 47 47 46 47 47 55 55 4 55 54 10 10 10 10 10 10 10 55 10 10 10 10 10 10 10 98 54 54 157 151 157 157 75 84 84 157 157 56 66 22 22 22 66 66 60 60 60 60 60 60 60 68 84 151 69 84 109 152 19 132 151 25 25 18 18 25 25 25 25 7 7 60 60 59 60 21 21 60 60 1 1 1 1 1 54 54 54 53 54 46 7 46 6 6 1 5 38 95 94 7 7 60 60 60 115 115 115 115 115 128 128 39 1 38 89 3 54 55 55 55 39 39 39 39 39 1 1 1 1 1 1 1 1 55 39 1 1 38 38 38 38 1 55 39 51 2 55 55 55 39 55 53 54 55 55 55 55 55 55 55 55 55 55 55 18 39 39 39 39 1 55 55 55 55 54 55 60 62 59 58 10 59 45 59 59 10 59 59 59 3 3 2 2 1 110 110 110 110 110 110 60 60 60 60 21 21 60 60 60 21 21 21 18 18 18 8 8 7 5 2 1 6 1 17 17 17 5 5 17 1 1 1 13 17 18 1 1 1 2 2 3 1 1 1 1 3 2 2 2 4 2 1 1 2 1 2 1 1 1 3 1 43 43 8 6 145 145 41 38 40 3 2 1 40 13 42 42 9 7 1 1 60 60 60 60 60 60 39 66 66 66 66 66 66 66 66 10 10 10 2 9 328 54 110 328 326 328 168 168 167 167 167 11 11 9 9 5 9 184 184 184 184 184 101 101 27 2 32 32 9 9 32 31 25 2 23 25 10 8 10 32 25 30 25 25 25 25 2 2 32 32 25 25 25 24 25 25 24 32 32 25 10 2 15 32 32 24 32 25 32 25 24 24 24 5 2 11 11 11 11 11 11 11 10 1 11 10 10 54 54 52 54 44 44 41 1 5 44 1 13 11 10 3 9 13 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 60 60 60 39 60 60 60 60 1 60 60 21 60 61 61 61 61 61 61 60 60 60 60 60 60 61 61 7 7 7 7 27 25 25 25 25 27 27 27 27 27 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 888 519 519 519 519 519 519 23 23 23 42 42 41 29 29 26 4 2 2 4 46 39 28 46 3005 3007 3006 3007 908 4 7 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 | /* * Generic process-grouping system. * * Based originally on the cpuset system, extracted by Paul Menage * Copyright (C) 2006 Google, Inc * * Notifications support * Copyright (C) 2009 Nokia Corporation * Author: Kirill A. Shutemov * * Copyright notices from the original cpuset code: * -------------------------------------------------- * Copyright (C) 2003 BULL SA. * Copyright (C) 2004-2006 Silicon Graphics, Inc. * * Portions derived from Patrick Mochel's sysfs code. * sysfs is Copyright (c) 2001-3 Patrick Mochel * * 2003-10-10 Written by Simon Derr. * 2003-10-22 Updates by Stephen Hemminger. * 2004 May-July Rework by Paul Jackson. * --------------------------------------------------- * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of the Linux * distribution for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "cgroup-internal.h" #include <linux/cred.h> #include <linux/errno.h> #include <linux/init_task.h> #include <linux/kernel.h> #include <linux/magic.h> #include <linux/mutex.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/percpu-rwsem.h> #include <linux/string.h> #include <linux/hashtable.h> #include <linux/idr.h> #include <linux/kthread.h> #include <linux/atomic.h> #include <linux/cpuset.h> #include <linux/proc_ns.h> #include <linux/nsproxy.h> #include <linux/file.h> #include <linux/sched/cputime.h> #include <net/sock.h> #define CREATE_TRACE_POINTS #include <trace/events/cgroup.h> #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) /* let's not notify more than 100 times per second */ #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * * css_set_lock protects task->cgroups pointer, the list of css_set * objects, and the chain of tasks off each css_set. * * These locks are exported if CONFIG_PROVE_RCU so that accessors in * cgroup.h can use them for lockdep annotations. */ DEFINE_MUTEX(cgroup_mutex); DEFINE_SPINLOCK(css_set_lock); #ifdef CONFIG_PROVE_RCU EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); #endif DEFINE_SPINLOCK(trace_cgroup_path_lock); char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; /* * Protects cgroup_idr and css_idr so that IDs can be released without * grabbing cgroup_mutex. */ static DEFINE_SPINLOCK(cgroup_idr_lock); /* * Protects cgroup_file->kn for !self csses. It synchronizes notifications * against file removal/re-creation across css hiding. */ static DEFINE_SPINLOCK(cgroup_file_kn_lock); struct percpu_rw_semaphore cgroup_threadgroup_rwsem; #define cgroup_assert_mutex_or_rcu_locked() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&cgroup_mutex), \ "cgroup_mutex or RCU read lock required"); /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup * destruction work items don't end up filling up max_active of system_wq * which may lead to deadlock. */ static struct workqueue_struct *cgroup_destroy_wq; /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, struct cgroup_subsys *cgroup_subsys[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS /* array of cgroup subsystem names */ #define SUBSYS(_x) [_x ## _cgrp_id] = #_x, static const char *cgroup_subsys_name[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS /* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */ #define SUBSYS(_x) \ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key); #include <linux/cgroup_subsys.h> #undef SUBSYS #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key, static struct static_key_true *cgroup_subsys_enabled_key[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key, static struct static_key_true *cgroup_subsys_on_dfl_key[] = { #include <linux/cgroup_subsys.h> }; #undef SUBSYS static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); /* * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. */ struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* * The default hierarchy always exists but is hidden until mounted for the * first time. This is for backward compatibility. */ static bool cgrp_dfl_visible; /* some controllers are not supported in the default hierarchy */ static u16 cgrp_dfl_inhibit_ss_mask; /* some controllers are implicitly enabled on the default hierarchy */ static u16 cgrp_dfl_implicit_ss_mask; /* some controllers can be threaded on the default hierarchy */ static u16 cgrp_dfl_threaded_ss_mask; /* The list of hierarchy roots */ LIST_HEAD(cgroup_roots); static int cgroup_root_count; /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ static DEFINE_IDR(cgroup_hierarchy_idr); /* * Assign a monotonically increasing serial number to csses. It guarantees * cgroups with bigger numbers are newer than those with smaller numbers. * Also, as csses are always appended to the parent's ->children list, it * guarantees that sibling csses are always sorted in the ascending serial * number order on the list. Protected by cgroup_mutex. */ static u64 css_serial_nr_next = 1; /* * These bitmasks identify subsystems with specific features to avoid * having to do iterative checks repeatedly. */ static u16 have_fork_callback __read_mostly; static u16 have_exit_callback __read_mostly; static u16 have_release_callback __read_mostly; static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { .count = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, .ns.inum = PROC_CGROUP_INIT_INO, .root_cset = &init_css_set, }; static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_base_files[]; static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_skip(struct css_task_iter *it, struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); /** * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID * @ssid: subsys ID of interest * * cgroup_subsys_enabled() can only be used with literal subsys names which * is fine for individual subsystems but unsuitable for cgroup core. This * is slower static_key_enabled() based test indexed by @ssid. */ bool cgroup_ssid_enabled(int ssid) { if (CGROUP_SUBSYS_COUNT == 0) return false; return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest * * The default hierarchy is the v2 interface of cgroup and this function * can be used to test whether a cgroup is on the default hierarchy for * cases where a subsystem should behave differnetly depending on the * interface version. * * The set of behaviors which change on the default hierarchy are still * being determined and the mount option is prefixed with __DEVEL__. * * List of changed behaviors: * * - Mount options "noprefix", "xattr", "clone_children", "release_agent" * and "name" are disallowed. * * - When mounting an existing superblock, mount options should match. * * - Remount is disallowed. * * - rename(2) is disallowed. * * - "tasks" is removed. Everything should be at process granularity. Use * "cgroup.procs" instead. * * - "cgroup.procs" is not sorted. pids will be unique unless they got * recycled inbetween reads. * * - "release_agent" and "notify_on_release" are removed. Replacement * notification mechanism will be implemented. * * - "cgroup.clone_children" is removed. * * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup * and its descendants contain no task; otherwise, 1. The file also * generates kernfs notification which can be monitored through poll and * [di]notify when the value of the file changes. * * - cpuset: tasks will be kept in empty cpusets when hotplug happens and * take masks of ancestors with non-empty cpus/mems, instead of being * moved to an ancestor. * * - cpuset: a task can be moved into an empty cpuset, and again it takes * masks of ancestors. * * - memcg: use_hierarchy is on by default and the cgroup file for the flag * is not created. * * - blkcg: blk-throttle becomes properly hierarchical. * * - debug: disallowed on the default hierarchy. */ bool cgroup_on_dfl(const struct cgroup *cgrp) { return cgrp->root == &cgrp_dfl_root; } /* IDR wrappers which synchronize using cgroup_idr_lock */ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) { int ret; idr_preload(gfp_mask); spin_lock_bh(&cgroup_idr_lock); ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM); spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; } static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) { void *ret; spin_lock_bh(&cgroup_idr_lock); ret = idr_replace(idr, ptr, id); spin_unlock_bh(&cgroup_idr_lock); return ret; } static void cgroup_idr_remove(struct idr *idr, int id) { spin_lock_bh(&cgroup_idr_lock); idr_remove(idr, id); spin_unlock_bh(&cgroup_idr_lock); } static bool cgroup_has_tasks(struct cgroup *cgrp) { return cgrp->nr_populated_csets; } bool cgroup_is_threaded(struct cgroup *cgrp) { return cgrp->dom_cgrp != cgrp; } /* can @cgrp host both domain and threaded children? */ static bool cgroup_is_mixable(struct cgroup *cgrp) { /* * Root isn't under domain level resource control exempting it from * the no-internal-process constraint, so it can serve as a thread * root and a parent of resource domains at the same time. */ return !cgroup_parent(cgrp); } /* can @cgrp become a thread root? should always be true for a thread root */ static bool cgroup_can_be_thread_root(struct cgroup *cgrp) { /* mixables don't care */ if (cgroup_is_mixable(cgrp)) return true; /* domain roots can't be nested under threaded */ if (cgroup_is_threaded(cgrp)) return false; /* can only have either domain or threaded children */ if (cgrp->nr_populated_domain_children) return false; /* and no domain controllers can be enabled */ if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) return false; return true; } /* is @cgrp root of a threaded subtree? */ bool cgroup_is_thread_root(struct cgroup *cgrp) { /* thread root should be a domain */ if (cgroup_is_threaded(cgrp)) return false; /* a domain w/ threaded children is a thread root */ if (cgrp->nr_threaded_children) return true; /* * A domain which has tasks and explicit threaded controllers * enabled is a thread root. */ if (cgroup_has_tasks(cgrp) && (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask)) return true; return false; } /* a domain which isn't connected to the root w/o brekage can't be used */ static bool cgroup_is_valid_domain(struct cgroup *cgrp) { /* the cgroup itself can be a thread root */ if (cgroup_is_threaded(cgrp)) return false; /* but the ancestors can't be unless mixable */ while ((cgrp = cgroup_parent(cgrp))) { if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp)) return false; if (cgroup_is_threaded(cgrp)) return false; } return true; } /* subsystems visibly enabled on a cgroup */ static u16 cgroup_control(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); u16 root_ss_mask = cgrp->root->subsys_mask; if (parent) { u16 ss_mask = parent->subtree_control; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) ss_mask &= cgrp_dfl_threaded_ss_mask; return ss_mask; } if (cgroup_on_dfl(cgrp)) root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | cgrp_dfl_implicit_ss_mask); return root_ss_mask; } /* subsystems enabled on a cgroup */ static u16 cgroup_ss_mask(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); if (parent) { u16 ss_mask = parent->subtree_ss_mask; /* threaded cgroups can only have threaded controllers */ if (cgroup_is_threaded(cgrp)) ss_mask &= cgrp_dfl_threaded_ss_mask; return ss_mask; } return cgrp->root->subsys_mask; } /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This * function must be called either under cgroup_mutex or rcu_read_lock() and * the caller is responsible for pinning the returned css if it wants to * keep accessing it outside the said locks. This function may return * %NULL if @cgrp doesn't have @subsys_id enabled. */ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { if (ss) return rcu_dereference_check(cgrp->subsys[ss->id], lockdep_is_held(&cgroup_mutex)); else return &cgrp->self; } /** * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get @cgrp's css assocaited with @ss. If the css doesn't exist * or is offline, %NULL is returned. */ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; rcu_read_lock(); css = cgroup_css(cgrp, ss); if (!css || !css_tryget_online(css)) css = NULL; rcu_read_unlock(); return css; } /** * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) * * Similar to cgroup_css() but returns the effective css, which is defined * as the matching css of the nearest ancestor including self which has @ss * enabled. If @ss is associated with the hierarchy @cgrp is on, this * function is guaranteed to return non-NULL css. */ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { lockdep_assert_held(&cgroup_mutex); if (!ss) return &cgrp->self; /* * This function is used while updating css associations and thus * can't test the csses directly. Test ss_mask. */ while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) { cgrp = cgroup_parent(cgrp); if (!cgrp) return NULL; } return cgroup_css(cgrp, ss); } /** * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest * @ss: the subsystem of interest * * Find and get the effective css of @cgrp for @ss. The effective css is * defined as the matching css of the nearest ancestor including self which * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, * the root css is returned, so this function always returns a valid css. * The returned css must be put using css_put(). */ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; rcu_read_lock(); do { css = cgroup_css(cgrp, ss); if (css && css_tryget_online(css)) goto out_unlock; cgrp = cgroup_parent(cgrp); } while (cgrp); css = init_css_set.subsys[ss->id]; css_get(css); out_unlock: rcu_read_unlock(); return css; } static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); css_get(&cgrp->self); } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of_cft(of); /* * This is open and unprotected implementation of cgroup_css(). * seq_css() is only called from a kernfs file operation which has * an active reference on the file. Because all the subsystem * files are drained before a css is disassociated with a cgroup, * the matching css from the cgroup's subsys table is guaranteed to * be and stay valid until the enclosing operation is complete. */ if (cft->ss) return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); else return &cgrp->self; } EXPORT_SYMBOL_GPL(of_css); /** * for_each_css - iterate all css's of a cgroup * @css: the iteration cursor * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end * @cgrp: the target cgroup to iterate css's of * * Should be called under cgroup_[tree_]mutex. */ #define for_each_css(css, ssid, cgrp) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ if (!((css) = rcu_dereference_check( \ (cgrp)->subsys[(ssid)], \ lockdep_is_held(&cgroup_mutex)))) { } \ else /** * for_each_e_css - iterate all effective css's of a cgroup * @css: the iteration cursor * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end * @cgrp: the target cgroup to iterate css's of * * Should be called under cgroup_[tree_]mutex. */ #define for_each_e_css(css, ssid, cgrp) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ ; \ else /** * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end * @ss_mask: the bitmask * * The block will only run for cases where the ssid-th bit (1 << ssid) of * @ss_mask is set. */ #define do_each_subsys_mask(ss, ssid, ss_mask) do { \ unsigned long __ss_mask = (ss_mask); \ if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \ (ssid) = 0; \ break; \ } \ for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \ (ss) = cgroup_subsys[ssid]; \ { #define while_each_subsys_mask() \ } \ } \ } while (false) /* iterate over child cgrps, lock should be held throughout iteration */ #define cgroup_for_each_live_child(child, cgrp) \ list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ if (({ lockdep_assert_held(&cgroup_mutex); \ cgroup_is_dead(child); })) \ ; \ else /* walk live descendants in preorder */ #define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ if (({ lockdep_assert_held(&cgroup_mutex); \ (dsct) = (d_css)->cgroup; \ cgroup_is_dead(dsct); })) \ ; \ else /* walk live descendants in postorder */ #define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ if (({ lockdep_assert_held(&cgroup_mutex); \ (dsct) = (d_css)->cgroup; \ cgroup_is_dead(dsct); })) \ ; \ else /* * The default css_set - used by init and its children prior to any * hierarchies being mounted. It contains a pointer to the root state * for each subsystem. Also used to anchor the list of css_sets. Not * reference-counted, to improve performance when child cgroups * haven't been created. */ struct css_set init_css_set = { .refcount = REFCOUNT_INIT(1), .dom_cset = &init_css_set, .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node), .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), /* * The following field is re-initialized when this cset gets linked * in cgroup_init(). However, let's initialize the field * statically too so that the default cgroup can be accessed safely * early during boot. */ .dfl_cgrp = &cgrp_dfl_root.cgrp, }; static int css_set_count = 1; /* 1 for init_css_set */ static bool css_set_threaded(struct css_set *cset) { return cset->dom_cset != cset; } /** * css_set_populated - does a css_set contain any tasks? * @cset: target css_set * * css_set_populated() should be the same as !!cset->nr_tasks at steady * state. However, css_set_populated() can be called while a task is being * added to or removed from the linked list before the nr_tasks is * properly updated. Hence, we can't just look at ->nr_tasks here. */ static bool css_set_populated(struct css_set *cset) { lockdep_assert_held(&css_set_lock); return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks); } /** * cgroup_update_populated - update the populated count of a cgroup * @cgrp: the target cgroup * @populated: inc or dec populated count * * One of the css_sets associated with @cgrp is either getting its first * task or losing the last. Update @cgrp->nr_populated_* accordingly. The * count is propagated towards root so that a given cgroup's * nr_populated_children is zero iff none of its descendants contain any * tasks. * * @cgrp's interface file "cgroup.populated" is zero if both * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and * 1 otherwise. When the sum changes from or to zero, userland is notified * that the content of the interface file has changed. This can be used to * detect when @cgrp and its descendants become populated or empty. */ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) { struct cgroup *child = NULL; int adj = populated ? 1 : -1; lockdep_assert_held(&css_set_lock); do { bool was_populated = cgroup_is_populated(cgrp); if (!child) { cgrp->nr_populated_csets += adj; } else { if (cgroup_is_threaded(child)) cgrp->nr_populated_threaded_children += adj; else cgrp->nr_populated_domain_children += adj; } if (was_populated == cgroup_is_populated(cgrp)) break; cgroup1_check_for_release(cgrp); cgroup_file_notify(&cgrp->events_file); child = cgrp; cgrp = cgroup_parent(cgrp); } while (cgrp); } /** * css_set_update_populated - update populated state of a css_set * @cset: target css_set * @populated: whether @cset is populated or depopulated * * @cset is either getting the first task or losing the last. Update the * populated counters of all associated cgroups accordingly. */ static void css_set_update_populated(struct css_set *cset, bool populated) { struct cgrp_cset_link *link; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) cgroup_update_populated(link->cgrp, populated); } /* * @task is leaving, advance task iterators which are pointing to it so * that they can resume at the next position. Advancing an iterator might * remove it from the list, use safe walk. See css_task_iter_skip() for * details. */ static void css_set_skip_task_iters(struct css_set *cset, struct task_struct *task) { struct css_task_iter *it, *pos; list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node) css_task_iter_skip(it, task); } /** * css_set_move_task - move a task from one css_set to another * @task: task being moved * @from_cset: css_set @task currently belongs to (may be NULL) * @to_cset: new css_set @task is being moved to (may be NULL) * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks * * Move @task from @from_cset to @to_cset. If @task didn't belong to any * css_set, @from_cset can be NULL. If @task is being disassociated * instead of moved, @to_cset can be NULL. * * This function automatically handles populated counter updates and * css_task_iter adjustments but the caller is responsible for managing * @from_cset and @to_cset's reference counts. */ static void css_set_move_task(struct task_struct *task, struct css_set *from_cset, struct css_set *to_cset, bool use_mg_tasks) { lockdep_assert_held(&css_set_lock); if (to_cset && !css_set_populated(to_cset)) css_set_update_populated(to_cset, true); if (from_cset) { WARN_ON_ONCE(list_empty(&task->cg_list)); css_set_skip_task_iters(from_cset, task); list_del_init(&task->cg_list); if (!css_set_populated(from_cset)) css_set_update_populated(from_cset, false); } else { WARN_ON_ONCE(!list_empty(&task->cg_list)); } if (to_cset) { /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race * against cgroup_exit() changing the css_set to * init_css_set and dropping the old one. */ WARN_ON_ONCE(task->flags & PF_EXITING); rcu_assign_pointer(task->cgroups, to_cset); list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks); } } /* * hash table for cgroup groups. This improves the performance to find * an existing css_set. This hash doesn't (currently) take into * account cgroups in empty hierarchies. */ #define CSS_SET_HASH_BITS 7 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) { unsigned long key = 0UL; struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) key += (unsigned long)css[i]; key = (key >> 16) ^ key; return key; } void put_css_set_locked(struct css_set *cset) { struct cgrp_cset_link *link, *tmp_link; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&css_set_lock); if (!refcount_dec_and_test(&cset->refcount)) return; WARN_ON_ONCE(!list_empty(&cset->threaded_csets)); /* This css_set is dead. unlink it and release cgroup and css refs */ for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); css_put(cset->subsys[ssid]); } hash_del(&cset->hlist); css_set_count--; list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { list_del(&link->cset_link); list_del(&link->cgrp_link); if (cgroup_parent(link->cgrp)) cgroup_put(link->cgrp); kfree(link); } if (css_set_threaded(cset)) { list_del(&cset->threaded_csets_node); put_css_set_locked(cset->dom_cset); } kfree_rcu(cset, rcu_head); } /** * compare_css_sets - helper function for find_existing_css_set(). * @cset: candidate css_set being tested * @old_cset: existing css_set for a task * @new_cgrp: cgroup that's being entered by the task * @template: desired set of css pointers in css_set (pre-calculated) * * Returns true if "cset" matches "old_cset" except for the hierarchy * which "new_cgrp" belongs to, for which it should match "new_cgrp". */ static bool compare_css_sets(struct css_set *cset, struct css_set *old_cset, struct cgroup *new_cgrp, struct cgroup_subsys_state *template[]) { struct cgroup *new_dfl_cgrp; struct list_head *l1, *l2; /* * On the default hierarchy, there can be csets which are * associated with the same set of cgroups but different csses. * Let's first ensure that csses match. */ if (memcmp(template, cset->subsys, sizeof(cset->subsys))) return false; /* @cset's domain should match the default cgroup's */ if (cgroup_on_dfl(new_cgrp)) new_dfl_cgrp = new_cgrp; else new_dfl_cgrp = old_cset->dfl_cgrp; if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp) return false; /* * Compare cgroup pointers in order to distinguish between * different cgroups in hierarchies. As different cgroups may * share the same effective css, this comparison is always * necessary. */ l1 = &cset->cgrp_links; l2 = &old_cset->cgrp_links; while (1) { struct cgrp_cset_link *link1, *link2; struct cgroup *cgrp1, *cgrp2; l1 = l1->next; l2 = l2->next; /* See if we reached the end - both lists are equal length. */ if (l1 == &cset->cgrp_links) { BUG_ON(l2 != &old_cset->cgrp_links); break; } else { BUG_ON(l2 == &old_cset->cgrp_links); } /* Locate the cgroups associated with these links. */ link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link); link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link); cgrp1 = link1->cgrp; cgrp2 = link2->cgrp; /* Hierarchies should be linked in the same order. */ BUG_ON(cgrp1->root != cgrp2->root); /* * If this hierarchy is the hierarchy of the cgroup * that's changing, then we need to check that this * css_set points to the new cgroup; if it's any other * hierarchy, then this css_set should point to the * same cgroup as the old css_set. */ if (cgrp1->root == new_cgrp->root) { if (cgrp1 != new_cgrp) return false; } else { if (cgrp1 != cgrp2) return false; } } return true; } /** * find_existing_css_set - init css array and find the matching css_set * @old_cset: the css_set that we're using before the cgroup transition * @cgrp: the cgroup that we're moving into * @template: out param for the new set of csses, should be clear on entry */ static struct css_set *find_existing_css_set(struct css_set *old_cset, struct cgroup *cgrp, struct cgroup_subsys_state *template[]) { struct cgroup_root *root = cgrp->root; struct cgroup_subsys *ss; struct css_set *cset; unsigned long key; int i; /* * Build the set of subsystem state objects that we want to see in the * new css_set. while subsystems can change globally, the entries here * won't change, so no need for locking. */ for_each_subsys(ss, i) { if (root->subsys_mask & (1UL << i)) { /* * @ss is in this hierarchy, so we want the * effective css from @cgrp. */ template[i] = cgroup_e_css(cgrp, ss); } else { /* * @ss is not in this hierarchy, so we don't want * to change the css. */ template[i] = old_cset->subsys[i]; } } key = css_set_hash(template); hash_for_each_possible(css_set_table, cset, hlist, key) { if (!compare_css_sets(cset, old_cset, cgrp, template)) continue; /* This css_set matches what we need */ return cset; } /* No existing cgroup group matched */ return NULL; } static void free_cgrp_cset_links(struct list_head *links_to_free) { struct cgrp_cset_link *link, *tmp_link; list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) { list_del(&link->cset_link); kfree(link); } } /** * allocate_cgrp_cset_links - allocate cgrp_cset_links * @count: the number of links to allocate * @tmp_links: list_head the allocated links are put on * * Allocate @count cgrp_cset_link structures and chain them on @tmp_links * through ->cset_link. Returns 0 on success or -errno. */ static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links) { struct cgrp_cset_link *link; int i; INIT_LIST_HEAD(tmp_links); for (i = 0; i < count; i++) { link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) { free_cgrp_cset_links(tmp_links); return -ENOMEM; } list_add(&link->cset_link, tmp_links); } return 0; } /** * link_css_set - a helper function to link a css_set to a cgroup * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links() * @cset: the css_set to be linked * @cgrp: the destination cgroup */ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, struct cgroup *cgrp) { struct cgrp_cset_link *link; BUG_ON(list_empty(tmp_links)); if (cgroup_on_dfl(cgrp)) cset->dfl_cgrp = cgrp; link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; /* * Always add links to the tail of the lists so that the lists are * in choronological order. */ list_move_tail(&link->cset_link, &cgrp->cset_links); list_add_tail(&link->cgrp_link, &cset->cgrp_links); if (cgroup_parent(cgrp)) cgroup_get_live(cgrp); } /** * find_css_set - return a new css_set with one cgroup updated * @old_cset: the baseline css_set * @cgrp: the cgroup to be updated * * Return a new css_set that's equivalent to @old_cset, but with @cgrp * substituted into the appropriate hierarchy. */ static struct css_set *find_css_set(struct css_set *old_cset, struct cgroup *cgrp) { struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { }; struct css_set *cset; struct list_head tmp_links; struct cgrp_cset_link *link; struct cgroup_subsys *ss; unsigned long key; int ssid; lockdep_assert_held(&cgroup_mutex); /* First see if we already have a cgroup group that matches * the desired set */ spin_lock_irq(&css_set_lock); cset = find_existing_css_set(old_cset, cgrp, template); if (cset) get_css_set(cset); spin_unlock_irq(&css_set_lock); if (cset) return cset; cset = kzalloc(sizeof(*cset), GFP_KERNEL); if (!cset) return NULL; /* Allocate all the cgrp_cset_link objects that we'll need */ if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) { kfree(cset); return NULL; } refcount_set(&cset->refcount, 1); cset->dom_cset = cset; INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); INIT_LIST_HEAD(&cset->dying_tasks); INIT_LIST_HEAD(&cset->task_iters); INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->mg_src_preload_node); INIT_LIST_HEAD(&cset->mg_dst_preload_node); INIT_LIST_HEAD(&cset->mg_node); /* Copy the set of subsystem state objects generated in * find_existing_css_set() */ memcpy(cset->subsys, template, sizeof(cset->subsys)); spin_lock_irq(&css_set_lock); /* Add reference counts and links from the new css_set. */ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; if (c->root == cgrp->root) c = cgrp; link_css_set(&tmp_links, cset, c); } BUG_ON(!list_empty(&tmp_links)); css_set_count++; /* Add @cset to the hash table */ key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cset->subsys[ssid]; list_add_tail(&cset->e_cset_node[ssid], &css->cgroup->e_csets[ssid]); css_get(css); } spin_unlock_irq(&css_set_lock); /* * If @cset should be threaded, look up the matching dom_cset and * link them up. We first fully initialize @cset then look for the * dom_cset. It's simpler this way and safe as @cset is guaranteed * to stay empty until we return. */ if (cgroup_is_threaded(cset->dfl_cgrp)) { struct css_set *dcset; dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp); if (!dcset) { put_css_set(cset); return NULL; } spin_lock_irq(&css_set_lock); cset->dom_cset = dcset; list_add_tail(&cset->threaded_csets_node, &dcset->threaded_csets); spin_unlock_irq(&css_set_lock); } return cset; } struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { struct cgroup *root_cgrp = kf_root->kn->priv; return root_cgrp->root; } static int cgroup_init_root_id(struct cgroup_root *root) { int id; lockdep_assert_held(&cgroup_mutex); id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL); if (id < 0) return id; root->hierarchy_id = id; return 0; } static void cgroup_exit_root_id(struct cgroup_root *root) { lockdep_assert_held(&cgroup_mutex); idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); } void cgroup_free_root(struct cgroup_root *root) { if (root) { idr_destroy(&root->cgroup_idr); kfree(root); } } static void cgroup_destroy_root(struct cgroup_root *root) { struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; trace_cgroup_destroy_root(root); cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); BUG_ON(atomic_read(&root->nr_cgrps)); BUG_ON(!list_empty(&cgrp->self.children)); /* Rebind all subsystems back to the default hierarchy */ WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask)); /* * Release all the links from cset_links to this hierarchy's * root cgroup */ spin_lock_irq(&css_set_lock); list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { list_del(&link->cset_link); list_del(&link->cgrp_link); kfree(link); } spin_unlock_irq(&css_set_lock); if (!list_empty(&root->root_list)) { list_del(&root->root_list); cgroup_root_count--; } cgroup_exit_root_id(root); mutex_unlock(&cgroup_mutex); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); } /* * look up cgroup associated with current task's cgroup namespace on the * specified hierarchy */ static struct cgroup * current_cgns_cgroup_from_root(struct cgroup_root *root) { struct cgroup *res = NULL; struct css_set *cset; lockdep_assert_held(&css_set_lock); rcu_read_lock(); cset = current->nsproxy->cgroup_ns->root_cset; if (cset == &init_css_set) { res = &root->cgrp; } else { struct cgrp_cset_link *link; list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; if (c->root == root) { res = c; break; } } } rcu_read_unlock(); BUG_ON(!res); return res; } /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { struct cgroup *res = NULL; lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); if (cset == &init_css_set) { res = &root->cgrp; } else if (root == &cgrp_dfl_root) { res = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; if (c->root == root) { res = c; break; } } } BUG_ON(!res); return res; } /* * Return the cgroup for "task" from the given hierarchy. Must be * called with cgroup_mutex and css_set_lock held. */ struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) { /* * No need to lock the task - since we hold cgroup_mutex the * task can't change groups, so the only thing that can happen * is that it exits and its css is set back to init_css_set. */ return cset_cgroup_from_root(task_css_set(task), root); } /* * A task must hold cgroup_mutex to modify cgroups. * * Any task can increment and decrement the count field without lock. * So in general, code holding cgroup_mutex can't rely on the count * field not changing. However, if the count goes to zero, then only * cgroup_attach_task() can increment it again. Because a count of zero * means that no tasks are currently attached, therefore there is no * way a task attached to that cgroup can fork (the other way to * increment the count). So code holding cgroup_mutex can safely * assume that if the count is zero, it will stay zero. Similarly, if * a task holds cgroup_mutex on a cgroup with zero count, it * knows that the cgroup won't be removed, as cgroup_rmdir() * needs that mutex. * * A cgroup can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cgroups is empty. Since all * tasks in the system use _some_ cgroup, and since there is always at * least one task in the system (init, pid == 1), therefore, root cgroup * always has either children cgroups and/or using tasks. So we don't * need a special hack to ensure that root cgroup cannot be deleted. * * P.S. One more locking exception. RCU is used to guard the * update of a tasks cgroup pointer by cgroup_attach_task() */ static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, char *buf) { struct cgroup_subsys *ss = cft->ss; if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, cft->name); else strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); return buf; } /** * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question * * S_IRUGO for read, S_IWUSR for write. */ static umode_t cgroup_file_mode(const struct cftype *cft) { umode_t mode = 0; if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; if (cft->write_u64 || cft->write_s64 || cft->write) { if (cft->flags & CFTYPE_WORLD_WRITABLE) mode |= S_IWUGO; else mode |= S_IWUSR; } return mode; } /** * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask * @subtree_control: the new subtree_control mask to consider * @this_ss_mask: available subsystems * * On the default hierarchy, a subsystem may request other subsystems to be * enabled together through its ->depends_on mask. In such cases, more * subsystems than specified in "cgroup.subtree_control" may be enabled. * * This function calculates which subsystems need to be enabled if * @subtree_control is to be applied while restricted to @this_ss_mask. */ static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) { u16 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&cgroup_mutex); cur_ss_mask |= cgrp_dfl_implicit_ss_mask; while (true) { u16 new_ss_mask = cur_ss_mask; do_each_subsys_mask(ss, ssid, cur_ss_mask) { new_ss_mask |= ss->depends_on; } while_each_subsys_mask(); /* * Mask out subsystems which aren't available. This can * happen only if some depended-upon subsystems were bound * to non-default hierarchies. */ new_ss_mask &= this_ss_mask; if (new_ss_mask == cur_ss_mask) break; cur_ss_mask = new_ss_mask; } return cur_ss_mask; } /** * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * * This helper undoes cgroup_kn_lock_live() and should be invoked before * the method finishes if locking succeeded. Note that once this function * returns the cgroup returned by cgroup_kn_lock_live() may become * inaccessible any time. If the caller intends to continue to access the * cgroup, it should pin it before invoking this function. */ void cgroup_kn_unlock(struct kernfs_node *kn) { struct cgroup *cgrp; if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else cgrp = kn->parent->priv; mutex_unlock(&cgroup_mutex); kernfs_unbreak_active_protection(kn); cgroup_put(cgrp); } /** * cgroup_kn_lock_live - locking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * @drain_offline: perform offline draining on the cgroup * * This helper is to be used by a cgroup kernfs method currently servicing * @kn. It breaks the active protection, performs cgroup locking and * verifies that the associated cgroup is alive. Returns the cgroup if * alive; otherwise, %NULL. A successful return should be undone by a * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the * cgroup is drained of offlining csses before return. * * Any cgroup kernfs method implementation which requires locking the * associated cgroup should use this helper. It avoids nesting cgroup * locking under kernfs active protection and allows all kernfs operations * including self-removal. */ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline) { struct cgroup *cgrp; if (kernfs_type(kn) == KERNFS_DIR) cgrp = kn->priv; else cgrp = kn->parent->priv; /* * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. cgroup liveliness check alone provides enough * protection against removal. Ensure @cgrp stays accessible and * break the active_ref protection. */ if (!cgroup_tryget(cgrp)) return NULL; kernfs_break_active_protection(kn); if (drain_offline) cgroup_lock_and_drain_offline(cgrp); else mutex_lock(&cgroup_mutex); if (!cgroup_is_dead(cgrp)) return cgrp; cgroup_kn_unlock(kn); return NULL; } static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; lockdep_assert_held(&cgroup_mutex); if (cft->file_offset) { struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); struct cgroup_file *cfile = (void *)css + cft->file_offset; spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = NULL; spin_unlock_irq(&cgroup_file_kn_lock); del_timer_sync(&cfile->notify_timer); } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } /** * css_clear_dir - remove subsys files in a cgroup directory * @css: taget css */ static void css_clear_dir(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; struct cftype *cfts; if (!(css->flags & CSS_VISIBLE)) return; css->flags &= ~CSS_VISIBLE; if (!css->ss) { if (cgroup_on_dfl(cgrp)) cfts = cgroup_base_files; else cfts = cgroup1_base_files; cgroup_addrm_files(css, cgrp, cfts, false); } else { list_for_each_entry(cfts, &css->ss->cfts, node) cgroup_addrm_files(css, cgrp, cfts, false); } } /** * css_populate_dir - create subsys files in a cgroup directory * @css: target css * * On failure, no file is added. */ static int css_populate_dir(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; struct cftype *cfts, *failed_cfts; int ret; if ((css->flags & CSS_VISIBLE) || !cgrp->kn) return 0; if (!css->ss) { if (cgroup_on_dfl(cgrp)) cfts = cgroup_base_files; else cfts = cgroup1_base_files; ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); if (ret < 0) return ret; } else { list_for_each_entry(cfts, &css->ss->cfts, node) { ret = cgroup_addrm_files(css, cgrp, cfts, true); if (ret < 0) { failed_cfts = cfts; goto err; } } } css->flags |= CSS_VISIBLE; return 0; err: list_for_each_entry(cfts, &css->ss->cfts, node) { if (cfts == failed_cfts) break; cgroup_addrm_files(css, cgrp, cfts, false); } return ret; } int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; int ssid, i, ret; u16 dfl_disable_ss_mask = 0; lockdep_assert_held(&cgroup_mutex); do_each_subsys_mask(ss, ssid, ss_mask) { /* * If @ss has non-root csses attached to it, can't move. * If @ss is an implicit controller, it is exempt from this * rule and can be stolen. */ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) && !ss->implicit_on_dfl) return -EBUSY; /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; /* * Collect ssid's that need to be disabled from default * hierarchy. */ if (ss->root == &cgrp_dfl_root) dfl_disable_ss_mask |= 1 << ssid; } while_each_subsys_mask(); if (dfl_disable_ss_mask) { struct cgroup *scgrp = &cgrp_dfl_root.cgrp; /* * Controllers from default hierarchy that need to be rebound * are all disabled together in one go. */ cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask; WARN_ON(cgroup_apply_control(scgrp)); cgroup_finalize_control(scgrp, 0); } do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); struct css_set *cset; WARN_ON(!css || cgroup_css(dcgrp, ss)); if (src_root != &cgrp_dfl_root) { /* disable from the source */ src_root->subsys_mask &= ~(1 << ssid); WARN_ON(cgroup_apply_control(scgrp)); cgroup_finalize_control(scgrp, 0); } /* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; css->cgroup = dcgrp; spin_lock_irq(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); spin_unlock_irq(&css_set_lock); /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); } else { dcgrp->subtree_control |= 1 << ssid; static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } ret = cgroup_apply_control(dcgrp); if (ret) pr_warn("partial failure to rebind %s controller (err=%d)\n", ss->name, ret); if (ss->bind) ss->bind(css); } while_each_subsys_mask(); kernfs_activate(dcgrp->kn); return 0; } int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root) { int len = 0; char *buf = NULL; struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root); struct cgroup *ns_cgroup; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) return -ENOMEM; spin_lock_irq(&css_set_lock); ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); spin_unlock_irq(&css_set_lock); if (len >= PATH_MAX) len = -ERANGE; else if (len > 0) { seq_escape(sf, buf, " \t\n\\"); len = 0; } kfree(buf); return len; } static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) { char *token; *root_flags = 0; if (!data || *data == '\0') return 0; while ((token = strsep(&data, ",")) != NULL) { if (!strcmp(token, "nsdelegate")) { *root_flags |= CGRP_ROOT_NS_DELEGATE; continue; } pr_err("cgroup2: unknown option \"%s\"\n", token); return -EINVAL; } return 0; } static void apply_cgroup_root_flags(unsigned int root_flags) { if (current->nsproxy->cgroup_ns == &init_cgroup_ns) { if (root_flags & CGRP_ROOT_NS_DELEGATE) cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; else cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; } } static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) seq_puts(seq, ",nsdelegate"); return 0; } static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { unsigned int root_flags; int ret; ret = parse_cgroup_root_flags(data, &root_flags); if (ret) return ret; apply_cgroup_root_flags(root_flags); return 0; } /* * To reduce the fork() overhead for systems that are not actually using * their cgroups capability, we don't maintain the lists running through * each css_set to its tasks until we see the list actually used - in other * words after the first mount. */ static bool use_task_css_set_links __read_mostly; static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; /* * We need tasklist_lock because RCU is not safe against * while_each_thread(). Besides, a forking task that has passed * cgroup_post_fork() without seeing use_task_css_set_links = 1 * is not guaranteed to have its child immediately visible in the * tasklist if we walk through it with RCU. */ read_lock(&tasklist_lock); spin_lock_irq(&css_set_lock); if (use_task_css_set_links) goto out_unlock; use_task_css_set_links = true; do_each_thread(g, p) { WARN_ON_ONCE(!list_empty(&p->cg_list) || task_css_set(p) != &init_css_set); /* * We should check if the process is exiting, otherwise * it will race with cgroup_exit() in that the list * entry won't be deleted though the process has exited. * Do it while holding siglock so that we don't end up * racing against cgroup_exit(). * * Interrupts were already disabled while acquiring * the css_set_lock, so we do not need to disable it * again when acquiring the sighand->siglock here. */ spin_lock(&p->sighand->siglock); if (!(p->flags & PF_EXITING)) { struct css_set *cset = task_css_set(p); if (!css_set_populated(cset)) css_set_update_populated(cset, true); list_add_tail(&p->cg_list, &cset->tasks); get_css_set(cset); cset->nr_tasks++; } spin_unlock(&p->sighand->siglock); } while_each_thread(g, p); out_unlock: spin_unlock_irq(&css_set_lock); read_unlock(&tasklist_lock); } static void init_cgroup_housekeeping(struct cgroup *cgrp) { struct cgroup_subsys *ss; int ssid; INIT_LIST_HEAD(&cgrp->self.sibling); INIT_LIST_HEAD(&cgrp->self.children); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; cgrp->self.flags |= CSS_ONLINE; cgrp->dom_cgrp = cgrp; cgrp->max_descendants = INT_MAX; cgrp->max_depth = INT_MAX; INIT_LIST_HEAD(&cgrp->rstat_css_list); prev_cputime_init(&cgrp->prev_cputime); for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); init_waitqueue_head(&cgrp->offline_waitq); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) { struct cgroup *cgrp = &root->cgrp; INIT_LIST_HEAD(&root->root_list); atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); idr_init(&root->cgroup_idr); root->flags = opts->flags; if (opts->release_agent) strscpy(root->release_agent_path, opts->release_agent, PATH_MAX); if (opts->name) strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); if (opts->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; struct kernfs_syscall_ops *kf_sops; struct css_set *cset; int i, ret; lockdep_assert_held(&cgroup_mutex); ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); if (ret < 0) goto out; root_cgrp->id = ret; root_cgrp->ancestor_ids[0] = ret; ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, ref_flags, GFP_KERNEL); if (ret) goto out; /* * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding * cgroup_lock, and that's us. Later rebinding may disable * controllers on the default hierarchy and thus create new csets, * which can't be more than the existing ones. Allocate 2x. */ ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links); if (ret) goto cancel_ref; ret = cgroup_init_root_id(root); if (ret) goto cancel_ref; kf_sops = root == &cgrp_dfl_root ? &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED | KERNFS_ROOT_SUPPORT_EXPORTOP, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); goto exit_root_id; } root_cgrp->kn = root->kf_root->kn; ret = css_populate_dir(&root_cgrp->self); if (ret) goto destroy_root; ret = rebind_subsystems(root, ss_mask); if (ret) goto destroy_root; ret = cgroup_bpf_inherit(root_cgrp); WARN_ON_ONCE(ret); trace_cgroup_setup_root(root); /* * There must be no failure case after here, since rebinding takes * care of subsystems' refcounts, which are explicitly dropped in * the failure exit path. */ list_add(&root->root_list, &cgroup_roots); cgroup_root_count++; /* * Link the root cgroup in this hierarchy into all the css_set * objects. */ spin_lock_irq(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) { link_css_set(&tmp_links, cset, root_cgrp); if (css_set_populated(cset)) cgroup_update_populated(root_cgrp, true); } spin_unlock_irq(&css_set_lock); BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); kernfs_activate(root_cgrp->kn); ret = 0; goto out; destroy_root: kernfs_destroy_root(root->kf_root); root->kf_root = NULL; exit_root_id: cgroup_exit_root_id(root); cancel_ref: percpu_ref_exit(&root_cgrp->self.refcnt); out: free_cgrp_cset_links(&tmp_links); return ret; } struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, struct cgroup_root *root, unsigned long magic, struct cgroup_namespace *ns) { struct dentry *dentry; bool new_sb = false; dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb); /* * In non-init cgroup namespace, instead of root cgroup's dentry, * we return the dentry corresponding to the cgroupns->root_cgrp. */ if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { struct dentry *nsdentry; struct super_block *sb = dentry->d_sb; struct cgroup *cgrp; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); cgrp = cset_cgroup_from_root(ns->root_cset, root); spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); nsdentry = kernfs_node_dentry(cgrp->kn, sb); dput(dentry); if (IS_ERR(nsdentry)) deactivate_locked_super(sb); dentry = nsdentry; } if (!new_sb) cgroup_put(&root->cgrp); return dentry; } static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct dentry *dentry; int ret; get_cgroup_ns(ns); /* Check if the caller has permission to mount. */ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { put_cgroup_ns(ns); return ERR_PTR(-EPERM); } /* * The first time anyone tries to mount a cgroup, enable the list * linking each css_set to its tasks and fix up all existing tasks. */ if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); if (fs_type == &cgroup2_fs_type) { unsigned int root_flags; ret = parse_cgroup_root_flags(data, &root_flags); if (ret) { put_cgroup_ns(ns); return ERR_PTR(ret); } cgrp_dfl_visible = true; cgroup_get_live(&cgrp_dfl_root.cgrp); dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, CGROUP2_SUPER_MAGIC, ns); if (!IS_ERR(dentry)) apply_cgroup_root_flags(root_flags); } else { dentry = cgroup1_mount(&cgroup_fs_type, flags, data, CGROUP_SUPER_MAGIC, ns); } put_cgroup_ns(ns); return dentry; } static void cgroup_kill_sb(struct super_block *sb) { struct kernfs_root *kf_root = kernfs_root_from_sb(sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); /* * If @root doesn't have any mounts or children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). * cgroup_mount() may wait for @root's release. * * And don't kill the default root. */ if (!list_empty(&root->cgrp.self.children) || root == &cgrp_dfl_root) cgroup_put(&root->cgrp); else percpu_ref_kill(&root->cgrp.self.refcnt); kernfs_kill_sb(sb); } struct file_system_type cgroup_fs_type = { .name = "cgroup", .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; static struct file_system_type cgroup2_fs_type = { .name = "cgroup2", .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns) { struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); } int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns) { int ret; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); return ret; } EXPORT_SYMBOL_GPL(cgroup_path_ns); /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task * @buf: the buffer to write the path into * @buflen: the length of the buffer * * Determine @task's cgroup on the first (the one with the lowest non-zero * hierarchy_id) cgroup hierarchy and copy its path into @buf. This * function grabs cgroup_mutex and shouldn't be used inside locks used by * cgroup controller callbacks. * * Return value is the same as kernfs_path(). */ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) { struct cgroup_root *root; struct cgroup *cgrp; int hierarchy_id = 1; int ret; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); if (root) { cgrp = task_cgroup_from_root(task, root); ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); } else { /* if no hierarchy exists, everyone is in "/" */ ret = strlcpy(buf, "/", buflen); } spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); return ret; } EXPORT_SYMBOL_GPL(task_cgroup_path); /** * cgroup_migrate_add_task - add a migration target task to a migration context * @task: target task * @mgctx: target migration context * * Add @task, which is a migration target, to @mgctx->tset. This function * becomes noop if @task doesn't need to be migrated. @task's css_set * should have been added as a migration source and @task->cg_list will be * moved from the css_set's tasks list to mg_tasks one. */ static void cgroup_migrate_add_task(struct task_struct *task, struct cgroup_mgctx *mgctx) { struct css_set *cset; lockdep_assert_held(&css_set_lock); /* @task either already exited or can't exit until the end */ if (task->flags & PF_EXITING) return; /* leave @task alone if post_fork() hasn't linked it yet */ if (list_empty(&task->cg_list)) return; cset = task_css_set(task); if (!cset->mg_src_cgrp) return; mgctx->tset.nr_tasks++; list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) list_add_tail(&cset->mg_node, &mgctx->tset.src_csets); if (list_empty(&cset->mg_dst_cset->mg_node)) list_add_tail(&cset->mg_dst_cset->mg_node, &mgctx->tset.dst_csets); } /** * cgroup_taskset_first - reset taskset and return the first task * @tset: taskset of interest * @dst_cssp: output variable for the destination css * * @tset iteration is initialized and the first task is returned. */ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp) { tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); tset->cur_task = NULL; return cgroup_taskset_next(tset, dst_cssp); } /** * cgroup_taskset_next - iterate to the next task in taskset * @tset: taskset of interest * @dst_cssp: output variable for the destination css * * Return the next task in @tset. Iteration must have been initialized * with cgroup_taskset_first(). */ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp) { struct css_set *cset = tset->cur_cset; struct task_struct *task = tset->cur_task; while (&cset->mg_node != tset->csets) { if (!task) task = list_first_entry(&cset->mg_tasks, struct task_struct, cg_list); else task = list_next_entry(task, cg_list); if (&task->cg_list != &cset->mg_tasks) { tset->cur_cset = cset; tset->cur_task = task; /* * This function may be called both before and * after cgroup_taskset_migrate(). The two cases * can be distinguished by looking at whether @cset * has its ->mg_dst_cset set. */ if (cset->mg_dst_cset) *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid]; else *dst_cssp = cset->subsys[tset->ssid]; return task; } cset = list_next_entry(cset, mg_node); task = NULL; } return NULL; } /** * cgroup_taskset_migrate - migrate a taskset * @mgctx: migration context * * Migrate tasks in @mgctx as setup by migration preparation functions. * This function fails iff one of the ->can_attach callbacks fails and * guarantees that either all or none of the tasks in @mgctx are migrated. * @mgctx is consumed regardless of success. */ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) { struct cgroup_taskset *tset = &mgctx->tset; struct cgroup_subsys *ss; struct task_struct *task, *tmp_task; struct css_set *cset, *tmp_cset; int ssid, failed_ssid, ret; /* check that we can legitimately attach to the cgroup */ if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->can_attach) { tset->ssid = ssid; ret = ss->can_attach(tset); if (ret) { failed_ssid = ssid; goto out_cancel_attach; } } } while_each_subsys_mask(); } /* * Now that we're guaranteed success, proceed to move all tasks to * the new cgroup. There are no failure cases after here, so this * is the commit point. */ spin_lock_irq(&css_set_lock); list_for_each_entry(cset, &tset->src_csets, mg_node) { list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { struct css_set *from_cset = task_css_set(task); struct css_set *to_cset = cset->mg_dst_cset; get_css_set(to_cset); to_cset->nr_tasks++; css_set_move_task(task, from_cset, to_cset, true); put_css_set_locked(from_cset); from_cset->nr_tasks--; } } spin_unlock_irq(&css_set_lock); /* * Migration is committed, all target tasks are now on dst_csets. * Nothing is sensitive to fork() after this point. Notify * controllers that migration is complete. */ tset->csets = &tset->dst_csets; if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ss->attach) { tset->ssid = ssid; ss->attach(tset); } } while_each_subsys_mask(); } ret = 0; goto out_release_tset; out_cancel_attach: if (tset->nr_tasks) { do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { if (ssid == failed_ssid) break; if (ss->cancel_attach) { tset->ssid = ssid; ss->cancel_attach(tset); } } while_each_subsys_mask(); } out_release_tset: spin_lock_irq(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { list_splice_tail_init(&cset->mg_tasks, &cset->tasks); list_del_init(&cset->mg_node); } spin_unlock_irq(&css_set_lock); /* * Re-initialize the cgroup_taskset structure in case it is reused * again in another cgroup_migrate_add_task()/cgroup_migrate_execute() * iteration. */ tset->nr_tasks = 0; tset->csets = &tset->src_csets; return ret; } /** * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination * @dst_cgrp: destination cgroup to test * * On the default hierarchy, except for the mixable, (possible) thread root * and threaded cgroups, subtree_control must be zero for migration * destination cgroups with tasks so that child cgroups don't compete * against tasks. */ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) { /* v1 doesn't have any restriction */ if (!cgroup_on_dfl(dst_cgrp)) return 0; /* verify @dst_cgrp can host resources */ if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp)) return -EOPNOTSUPP; /* mixables don't care */ if (cgroup_is_mixable(dst_cgrp)) return 0; /* * If @dst_cgrp is already or can become a thread root or is * threaded, it doesn't matter. */ if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp)) return 0; /* apply no-internal-process constraint */ if (dst_cgrp->subtree_control) return -EBUSY; return 0; } /** * cgroup_migrate_finish - cleanup after attach * @mgctx: migration context * * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See * those functions for details. */ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) { struct css_set *cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets, mg_src_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_src_preload_node); put_css_set_locked(cset); } list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets, mg_dst_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_dst_preload_node); put_css_set_locked(cset); } spin_unlock_irq(&css_set_lock); } /** * cgroup_migrate_add_src - add a migration source css_set * @src_cset: the source css_set to add * @dst_cgrp: the destination cgroup * @mgctx: migration context * * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin * @src_cset and add it to @mgctx->src_csets, which should later be cleaned * up by cgroup_migrate_finish(). * * This function may be called without holding cgroup_threadgroup_rwsem * even if the target is a process. Threads may be created and destroyed * but as long as cgroup_mutex is not dropped, no new css_set can be put * into play and the preloaded css_sets are guaranteed to cover all * migrations. */ void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, struct cgroup_mgctx *mgctx) { struct cgroup *src_cgrp; lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); /* * If ->dead, @src_set is associated with one or more dead cgroups * and doesn't contain any migratable tasks. Ignore it early so * that the rest of migration path doesn't get confused by it. */ if (src_cset->dead) return; src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); if (!list_empty(&src_cset->mg_src_preload_node)) return; WARN_ON(src_cset->mg_src_cgrp); WARN_ON(src_cset->mg_dst_cgrp); WARN_ON(!list_empty(&src_cset->mg_tasks)); WARN_ON(!list_empty(&src_cset->mg_node)); src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets); } /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration * @mgctx: migration context * * Tasks are about to be moved and all the source css_sets have been * preloaded to @mgctx->preloaded_src_csets. This function looks up and * pins all destination css_sets, links each to its source, and append them * to @mgctx->preloaded_dst_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on * @mgctx. */ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) { struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, mg_src_preload_node) { struct css_set *dst_cset; struct cgroup_subsys *ss; int ssid; dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) goto err; WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); /* * If src cset equals dst, it's noop. Drop the src. * cgroup_migrate() will skip the cset too. Note that we * can't handle src == dst as some nodes are used by both. */ if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; src_cset->mg_dst_cgrp = NULL; list_del_init(&src_cset->mg_src_preload_node); put_css_set(src_cset); put_css_set(dst_cset); continue; } src_cset->mg_dst_cset = dst_cset; if (list_empty(&dst_cset->mg_dst_preload_node)) list_add_tail(&dst_cset->mg_dst_preload_node, &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); for_each_subsys(ss, ssid) if (src_cset->subsys[ssid] != dst_cset->subsys[ssid]) mgctx->ss_mask |= 1 << ssid; } return 0; err: cgroup_migrate_finish(mgctx); return -ENOMEM; } /** * cgroup_migrate - migrate a process or task to a cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task * @mgctx: migration context * * Migrate a process or task denoted by @leader. If migrating a process, * the caller must be holding cgroup_threadgroup_rwsem. The caller is also * responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). * * As long as a controller's ->can_attach() doesn't fail, this function is * guaranteed to succeed. This means that, excluding ->can_attach() * failure, when migrating multiple targets, the success or failure can be * decided for all targets by invoking group_migrate_prepare_dst() before * actually starting migrating. */ int cgroup_migrate(struct task_struct *leader, bool threadgroup, struct cgroup_mgctx *mgctx) { struct task_struct *task; /* * Prevent freeing of tasks while we take a snapshot. Tasks that are * already PF_EXITING could be freed from underneath us unless we * take an rcu_read_lock. */ spin_lock_irq(&css_set_lock); rcu_read_lock(); task = leader; do { cgroup_migrate_add_task(task, mgctx); if (!threadgroup) break; } while_each_thread(leader, task); rcu_read_unlock(); spin_unlock_irq(&css_set_lock); return cgroup_migrate_execute(mgctx); } /** * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup * @dst_cgrp: the cgroup to attach to * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. */ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) { DEFINE_CGROUP_MGCTX(mgctx); struct task_struct *task; int ret; ret = cgroup_migrate_vet_dst(dst_cgrp); if (ret) return ret; /* look up all src csets */ spin_lock_irq(&css_set_lock); rcu_read_lock(); task = leader; do { cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); if (!threadgroup) break; } while_each_thread(leader, task); rcu_read_unlock(); spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ ret = cgroup_migrate_prepare_dst(&mgctx); if (!ret) ret = cgroup_migrate(leader, threadgroup, &mgctx); cgroup_migrate_finish(&mgctx); if (!ret) TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup); return ret; } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) __acquires(&cgroup_threadgroup_rwsem) { struct task_struct *tsk; pid_t pid; if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); percpu_down_write(&cgroup_threadgroup_rwsem); rcu_read_lock(); if (pid) { tsk = find_task_by_vpid(pid); if (!tsk) { tsk = ERR_PTR(-ESRCH); goto out_unlock_threadgroup; } } else { tsk = current; } if (threadgroup) tsk = tsk->group_leader; /* * kthreads may acquire PF_NO_SETAFFINITY during initialization. * If userland migrates such a kthread to a non-root cgroup, it can * become trapped in a cpuset, or RT kthread may be born in a * cgroup with no rt_runtime allocated. Just say no. */ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { tsk = ERR_PTR(-EINVAL); goto out_unlock_threadgroup; } get_task_struct(tsk); goto out_unlock_rcu; out_unlock_threadgroup: percpu_up_write(&cgroup_threadgroup_rwsem); out_unlock_rcu: rcu_read_unlock(); return tsk; } void cgroup_procs_write_finish(struct task_struct *task) __releases(&cgroup_threadgroup_rwsem) { struct cgroup_subsys *ss; int ssid; /* release reference from cgroup_procs_write_start() */ put_task_struct(task); percpu_up_write(&cgroup_threadgroup_rwsem); for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) { struct cgroup_subsys *ss; bool printed = false; int ssid; do_each_subsys_mask(ss, ssid, ss_mask) { if (printed) seq_putc(seq, ' '); seq_printf(seq, "%s", ss->name); printed = true; } while_each_subsys_mask(); if (printed) seq_putc(seq, '\n'); } /* show controllers which are enabled from the parent */ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; cgroup_print_ss_mask(seq, cgroup_control(cgrp)); return 0; } /* show controllers which are enabled for a given cgroup's children */ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; cgroup_print_ss_mask(seq, cgrp->subtree_control); return 0; } /** * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy * @cgrp: root of the subtree to update csses for * * @cgrp's control masks have changed and its subtree's css associations * need to be updated accordingly. This function looks up all css_sets * which are attached to the subtree, creates the matching updated css_sets * and migrates the tasks to the new ones. */ static int cgroup_update_dfl_csses(struct cgroup *cgrp) { DEFINE_CGROUP_MGCTX(mgctx); struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; int ret; lockdep_assert_held(&cgroup_mutex); percpu_down_write(&cgroup_threadgroup_rwsem); /* look up all csses currently attached to @cgrp's subtree */ spin_lock_irq(&css_set_lock); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; list_for_each_entry(link, &dsct->cset_links, cset_link) cgroup_migrate_add_src(link->cset, dsct, &mgctx); } spin_unlock_irq(&css_set_lock); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_finish; spin_lock_irq(&css_set_lock); list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_src_preload_node) { struct task_struct *task, *ntask; /* all tasks in src_csets need to be migrated */ list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) cgroup_migrate_add_task(task, &mgctx); } spin_unlock_irq(&css_set_lock); ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } /** * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses * @cgrp: root of the target subtree * * Because css offlining is asynchronous, userland may try to re-enable a * controller while the previous css is still around. This function grabs * cgroup_mutex and drains the previous css instances of @cgrp's subtree. */ void cgroup_lock_and_drain_offline(struct cgroup *cgrp) __acquires(&cgroup_mutex) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid; restart: mutex_lock(&cgroup_mutex); cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); DEFINE_WAIT(wait); if (!css || !percpu_ref_is_dying(&css->refcnt)) continue; cgroup_get_live(dsct); prepare_to_wait(&dsct->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&cgroup_mutex); schedule(); finish_wait(&dsct->offline_waitq, &wait); cgroup_put(dsct); goto restart; } } } /** * cgroup_save_control - save control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the * respective old_ prefixed fields for @cgrp's subtree including @cgrp * itself. */ static void cgroup_save_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->old_subtree_control = dsct->subtree_control; dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; dsct->old_dom_cgrp = dsct->dom_cgrp; } } /** * cgroup_propagate_control - refresh control masks of a subtree * @cgrp: root of the target subtree * * For @cgrp and its subtree, ensure ->subtree_ss_mask matches * ->subtree_control and propagate controller availability through the * subtree so that descendants don't have unavailable controllers enabled. */ static void cgroup_propagate_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { dsct->subtree_control &= cgroup_control(dsct); dsct->subtree_ss_mask = cgroup_calc_subtree_ss_mask(dsct->subtree_control, cgroup_ss_mask(dsct)); } } /** * cgroup_restore_control - restore control masks and dom_cgrp of a subtree * @cgrp: root of the target subtree * * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the * respective old_ prefixed fields for @cgrp's subtree including @cgrp * itself. */ static void cgroup_restore_control(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { dsct->subtree_control = dsct->old_subtree_control; dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; dsct->dom_cgrp = dsct->old_dom_cgrp; } } static bool css_visible(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; if (cgroup_control(cgrp) & (1 << ss->id)) return true; if (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) return false; return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl; } /** * cgroup_apply_control_enable - enable or show csses according to control * @cgrp: root of the target subtree * * Walk @cgrp's subtree and create new csses or make the existing ones * visible. A css is created invisible if it's being implicitly enabled * through dependency. An invisible css is made visible when the userland * explicitly enables it. * * Returns 0 on success, -errno on failure. On failure, csses which have * been processed already aren't cleaned up. The caller is responsible for * cleaning up with cgroup_apply_control_disable(). */ static int cgroup_apply_control_enable(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid, ret; cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) continue; if (!css) { css = css_create(dsct, ss); if (IS_ERR(css)) return PTR_ERR(css); } WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); if (css_visible(css)) { ret = css_populate_dir(css); if (ret) return ret; } } } return 0; } /** * cgroup_apply_control_disable - kill or hide csses according to control * @cgrp: root of the target subtree * * Walk @cgrp's subtree and kill and hide csses so that they match * cgroup_ss_mask() and cgroup_visible_mask(). * * A css is hidden when the userland requests it to be disabled while other * subsystems are still depending on it. The css must not actively control * resources and be in the vanilla state if it's made visible again later. * Controllers which may be depended upon should provide ->css_reset() for * this purpose. */ static void cgroup_apply_control_disable(struct cgroup *cgrp) { struct cgroup *dsct; struct cgroup_subsys_state *d_css; struct cgroup_subsys *ss; int ssid; cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); if (!css) continue; WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); if (css->parent && !(cgroup_ss_mask(dsct) & (1 << ss->id))) { kill_css(css); } else if (!css_visible(css)) { css_clear_dir(css); if (ss->css_reset) ss->css_reset(css); } } } } /** * cgroup_apply_control - apply control mask updates to the subtree * @cgrp: root of the target subtree * * subsystems can be enabled and disabled in a subtree using the following * steps. * * 1. Call cgroup_save_control() to stash the current state. * 2. Update ->subtree_control masks in the subtree as desired. * 3. Call cgroup_apply_control() to apply the changes. * 4. Optionally perform other related operations. * 5. Call cgroup_finalize_control() to finish up. * * This function implements step 3 and propagates the mask changes * throughout @cgrp's subtree, updates csses accordingly and perform * process migrations. */ static int cgroup_apply_control(struct cgroup *cgrp) { int ret; cgroup_propagate_control(cgrp); ret = cgroup_apply_control_enable(cgrp); if (ret) return ret; /* * At this point, cgroup_e_css() results reflect the new csses * making the following cgroup_update_dfl_csses() properly update * css associations of all tasks in the subtree. */ ret = cgroup_update_dfl_csses(cgrp); if (ret) return ret; return 0; } /** * cgroup_finalize_control - finalize control mask update * @cgrp: root of the target subtree * @ret: the result of the update * * Finalize control mask update. See cgroup_apply_control() for more info. */ static void cgroup_finalize_control(struct cgroup *cgrp, int ret) { if (ret) { cgroup_restore_control(cgrp); cgroup_propagate_control(cgrp); } cgroup_apply_control_disable(cgrp); } static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable) { u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask; /* if nothing is getting enabled, nothing to worry about */ if (!enable) return 0; /* can @cgrp host any resources? */ if (!cgroup_is_valid_domain(cgrp->dom_cgrp)) return -EOPNOTSUPP; /* mixables don't care */ if (cgroup_is_mixable(cgrp)) return 0; if (domain_enable) { /* can't enable domain controllers inside a thread subtree */ if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp)) return -EOPNOTSUPP; } else { /* * Threaded controllers can handle internal competitions * and are always allowed inside a (prospective) thread * subtree. */ if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp)) return 0; } /* * Controllers can't be enabled for a cgroup with tasks to avoid * child cgroups competing against tasks. */ if (cgroup_has_tasks(cgrp)) return -EBUSY; return 0; } /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { u16 enable = 0, disable = 0; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; int ssid, ret; /* * Parse input - space separated list of subsystem names prefixed * with either + or -. */ buf = strstrip(buf); while ((tok = strsep(&buf, " "))) { if (tok[0] == '\0') continue; do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) { if (!cgroup_ssid_enabled(ssid) || strcmp(tok + 1, ss->name)) continue; if (*tok == '+') { enable |= 1 << ssid; disable &= ~(1 << ssid); } else if (*tok == '-') { disable |= 1 << ssid; enable &= ~(1 << ssid); } else { return -EINVAL; } break; } while_each_subsys_mask(); if (ssid == CGROUP_SUBSYS_COUNT) return -EINVAL; } cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENODEV; for_each_subsys(ss, ssid) { if (enable & (1 << ssid)) { if (cgrp->subtree_control & (1 << ssid)) { enable &= ~(1 << ssid); continue; } if (!(cgroup_control(cgrp) & (1 << ssid))) { ret = -ENOENT; goto out_unlock; } } else if (disable & (1 << ssid)) { if (!(cgrp->subtree_control & (1 << ssid))) { disable &= ~(1 << ssid); continue; } /* a child has it enabled? */ cgroup_for_each_live_child(child, cgrp) { if (child->subtree_control & (1 << ssid)) { ret = -EBUSY; goto out_unlock; } } } } if (!enable && !disable) { ret = 0; goto out_unlock; } ret = cgroup_vet_subtree_control_enable(cgrp, enable); if (ret) goto out_unlock; /* save and update control masks and prepare csses */ cgroup_save_control(cgrp); cgrp->subtree_control |= enable; cgrp->subtree_control &= ~disable; ret = cgroup_apply_control(cgrp); cgroup_finalize_control(cgrp, ret); if (ret) goto out_unlock; kernfs_activate(cgrp->kn); out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; } /** * cgroup_enable_threaded - make @cgrp threaded * @cgrp: the target cgroup * * Called when "threaded" is written to the cgroup.type interface file and * tries to make @cgrp threaded and join the parent's resource domain. * This function is never called on the root cgroup as cgroup.type doesn't * exist on it. */ static int cgroup_enable_threaded(struct cgroup *cgrp) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup *dom_cgrp = parent->dom_cgrp; struct cgroup *dsct; struct cgroup_subsys_state *d_css; int ret; lockdep_assert_held(&cgroup_mutex); /* noop if already threaded */ if (cgroup_is_threaded(cgrp)) return 0; /* * If @cgroup is populated or has domain controllers enabled, it * can't be switched. While the below cgroup_can_be_thread_root() * test can catch the same conditions, that's only when @parent is * not mixable, so let's check it explicitly. */ if (cgroup_is_populated(cgrp) || cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) return -EOPNOTSUPP; /* we're joining the parent's domain, ensure its validity */ if (!cgroup_is_valid_domain(dom_cgrp) || !cgroup_can_be_thread_root(dom_cgrp)) return -EOPNOTSUPP; /* * The following shouldn't cause actual migrations and should * always succeed. */ cgroup_save_control(cgrp); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) if (dsct == cgrp || cgroup_is_threaded(dsct)) dsct->dom_cgrp = dom_cgrp; ret = cgroup_apply_control(cgrp); if (!ret) parent->nr_threaded_children++; cgroup_finalize_control(cgrp, ret); return ret; } static int cgroup_type_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; if (cgroup_is_threaded(cgrp)) seq_puts(seq, "threaded\n"); else if (!cgroup_is_valid_domain(cgrp)) seq_puts(seq, "domain invalid\n"); else if (cgroup_is_thread_root(cgrp)) seq_puts(seq, "domain threaded\n"); else seq_puts(seq, "domain\n"); return 0; } static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; int ret; /* only switching to threaded mode is supported */ if (strcmp(strstrip(buf), "threaded")) return -EINVAL; /* drain dying csses before we re-apply (threaded) subtree control */ cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENOENT; /* threaded can only be enabled */ ret = cgroup_enable_threaded(cgrp); cgroup_kn_unlock(of->kn); return ret ?: nbytes; } static int cgroup_max_descendants_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; int descendants = READ_ONCE(cgrp->max_descendants); if (descendants == INT_MAX) seq_puts(seq, "max\n"); else seq_printf(seq, "%d\n", descendants); return 0; } static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; int descendants; ssize_t ret; buf = strstrip(buf); if (!strcmp(buf, "max")) { descendants = INT_MAX; } else { ret = kstrtoint(buf, 0, &descendants); if (ret) return ret; } if (descendants < 0) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; cgrp->max_descendants = descendants; cgroup_kn_unlock(of->kn); return nbytes; } static int cgroup_max_depth_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; int depth = READ_ONCE(cgrp->max_depth); if (depth == INT_MAX) seq_puts(seq, "max\n"); else seq_printf(seq, "%d\n", depth); return 0; } static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; ssize_t ret; int depth; buf = strstrip(buf); if (!strcmp(buf, "max")) { depth = INT_MAX; } else { ret = kstrtoint(buf, 0, &depth); if (ret) return ret; } if (depth < 0) return -ERANGE; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENOENT; cgrp->max_depth = depth; cgroup_kn_unlock(of->kn); return nbytes; } static int cgroup_events_show(struct seq_file *seq, void *v) { seq_printf(seq, "populated %d\n", cgroup_is_populated(seq_css(seq)->cgroup)); return 0; } static int cgroup_stat_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup; seq_printf(seq, "nr_descendants %d\n", cgroup->nr_descendants); seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); return 0; } static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq, struct cgroup *cgrp, int ssid) { struct cgroup_subsys *ss = cgroup_subsys[ssid]; struct cgroup_subsys_state *css; int ret; if (!ss->css_extra_stat_show) return 0; css = cgroup_tryget_css(cgrp, ss); if (!css) return 0; ret = ss->css_extra_stat_show(seq, css); css_put(css); return ret; } static int cpu_stat_show(struct seq_file *seq, void *v) { struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; int ret = 0; cgroup_base_stat_cputime_show(seq); #ifdef CONFIG_CGROUP_SCHED ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); #endif return ret; } static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; struct cgroup_file_ctx *ctx; int ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->ns = current->nsproxy->cgroup_ns; get_cgroup_ns(ctx->ns); of->priv = ctx; if (!cft->open) return 0; ret = cft->open(of); if (ret) { put_cgroup_ns(ctx->ns); kfree(ctx); } return ret; } static void cgroup_file_release(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; struct cgroup_file_ctx *ctx = of->priv; if (cft->release) cft->release(of); put_cgroup_ns(ctx->ns); kfree(ctx); } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of->kn->priv; struct cgroup_subsys_state *css; int ret; /* * If namespaces are delegation boundaries, disallow writes to * files in an non-init namespace root from inside the namespace * except for the files explicitly marked delegatable - * cgroup.procs and cgroup.subtree_control. */ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && !(cft->flags & CFTYPE_NS_DELEGATABLE) && ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) return -EPERM; if (cft->write) return cft->write(of, buf, nbytes, off); /* * kernfs guarantees that a file isn't deleted with operations in * flight, which means that the matching css is and stays alive and * doesn't need to be pinned. The RCU locking is not necessary * either. It's just for the convenience of using cgroup_css(). */ rcu_read_lock(); css = cgroup_css(cgrp, cft->ss); rcu_read_unlock(); if (cft->write_u64) { unsigned long long v; ret = kstrtoull(buf, 0, &v); if (!ret) ret = cft->write_u64(css, cft, v); } else if (cft->write_s64) { long long v; ret = kstrtoll(buf, 0, &v); if (!ret) ret = cft->write_s64(css, cft, v); } else { ret = -EINVAL; } return ret ?: nbytes; } static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { return seq_cft(seq)->seq_start(seq, ppos); } static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) { return seq_cft(seq)->seq_next(seq, v, ppos); } static void cgroup_seqfile_stop(struct seq_file *seq, void *v) { if (seq_cft(seq)->seq_stop) seq_cft(seq)->seq_stop(seq, v); } static int cgroup_seqfile_show(struct seq_file *m, void *arg) { struct cftype *cft = seq_cft(m); struct cgroup_subsys_state *css = seq_css(m); if (cft->seq_show) return cft->seq_show(m, arg); if (cft->read_u64) seq_printf(m, "%llu\n", cft->read_u64(css, cft)); else if (cft->read_s64) seq_printf(m, "%lld\n", cft->read_s64(css, cft)); else return -EINVAL; return 0; } static struct kernfs_ops cgroup_kf_single_ops = { .atomic_write_len = PAGE_SIZE, .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, .seq_show = cgroup_seqfile_show, }; static struct kernfs_ops cgroup_kf_ops = { .atomic_write_len = PAGE_SIZE, .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, .seq_start = cgroup_seqfile_start, .seq_next = cgroup_seqfile_next, .seq_stop = cgroup_seqfile_stop, .seq_show = cgroup_seqfile_show, }; /* set uid and gid of cgroup dirs and files to that of the creator */ static int cgroup_kn_set_ugid(struct kernfs_node *kn) { struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = current_fsuid(), .ia_gid = current_fsgid(), }; if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) return 0; return kernfs_setattr(kn, &iattr); } static void cgroup_file_notify_timer(struct timer_list *timer) { cgroup_file_notify(container_of(timer, struct cgroup_file, notify_timer)); } static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; struct lock_class_key *key = NULL; int ret; #ifdef CONFIG_DEBUG_LOCK_ALLOC key = &cft->lockdep_key; #endif kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), cgroup_file_mode(cft), GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, cft->kf_ops, cft, NULL, key); if (IS_ERR(kn)) return PTR_ERR(kn); ret = cgroup_kn_set_ugid(kn); if (ret) { kernfs_remove(kn); return ret; } if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = kn; spin_unlock_irq(&cgroup_file_kn_lock); } return 0; } /** * cgroup_addrm_files - add or remove files to a cgroup directory * @css: the target css * @cgrp: the target cgroup (usually css->cgroup) * @cfts: array of cftypes to be added * @is_add: whether to add or remove * * Depending on @is_add, add or remove files defined by @cfts on @cgrp. * For removals, this function never fails. */ static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add) { struct cftype *cft, *cft_end = NULL; int ret = 0; lockdep_assert_held(&cgroup_mutex); restart: for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) continue; if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) continue; if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) continue; if (is_add) { ret = cgroup_add_file(css, cgrp, cft); if (ret) { pr_warn("%s: failed to add %s, err=%d\n", __func__, cft->name, ret); cft_end = cft; is_add = false; goto restart; } } else { cgroup_rm_file(cgrp, cft); } } return ret; } static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) { struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->cgrp; struct cgroup_subsys_state *css; int ret = 0; lockdep_assert_held(&cgroup_mutex); /* add/rm files for all cgroups created before */ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; if (!(css->flags & CSS_VISIBLE)) continue; ret = cgroup_addrm_files(css, cgrp, cfts, is_add); if (ret) break; } if (is_add && !ret) kernfs_activate(root->kn); return ret; } static void cgroup_exit_cftypes(struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft->name[0] != '\0'; cft++) { /* free copy for custom atomic_write_len, see init_cftypes() */ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) kfree(cft->kf_ops); cft->kf_ops = NULL; cft->ss = NULL; /* revert flags set by cgroup core while adding @cfts */ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL); } } static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft->name[0] != '\0'; cft++) { struct kernfs_ops *kf_ops; WARN_ON(cft->ss || cft->kf_ops); if (cft->seq_start) kf_ops = &cgroup_kf_ops; else kf_ops = &cgroup_kf_single_ops; /* * Ugh... if @cft wants a custom max_write_len, we need to * make a copy of kf_ops to set its atomic_write_len. */ if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); if (!kf_ops) { cgroup_exit_cftypes(cfts); return -ENOMEM; } kf_ops->atomic_write_len = cft->max_write_len; } cft->kf_ops = kf_ops; cft->ss = ss; } return 0; } static int cgroup_rm_cftypes_locked(struct cftype *cfts) { lockdep_assert_held(&cgroup_mutex); if (!cfts || !cfts[0].ss) return -ENOENT; list_del(&cfts->node); cgroup_apply_cftypes(cfts, false); cgroup_exit_cftypes(cfts); return 0; } /** * cgroup_rm_cftypes - remove an array of cftypes from a subsystem * @cfts: zero-length name terminated array of cftypes * * Unregister @cfts. Files described by @cfts are removed from all * existing cgroups and all future cgroups won't have them either. This * function can be called anytime whether @cfts' subsys is attached or not. * * Returns 0 on successful unregistration, -ENOENT if @cfts is not * registered. */ int cgroup_rm_cftypes(struct cftype *cfts) { int ret; mutex_lock(&cgroup_mutex); ret = cgroup_rm_cftypes_locked(cfts); mutex_unlock(&cgroup_mutex); return ret; } /** * cgroup_add_cftypes - add an array of cftypes to a subsystem * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Register @cfts to @ss. Files described by @cfts are created for all * existing cgroups to which @ss is attached and all future cgroups will * have them too. This function can be called anytime whether @ss is * attached or not. * * Returns 0 on successful registration, -errno on failure. Note that this * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. */ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; if (!cgroup_ssid_enabled(ss->id)) return 0; if (!cfts || cfts[0].name[0] == '\0') return 0; ret = cgroup_init_cftypes(ss, cfts); if (ret) return ret; mutex_lock(&cgroup_mutex); list_add_tail(&cfts->node, &ss->cfts); ret = cgroup_apply_cftypes(cfts, true); if (ret) cgroup_rm_cftypes_locked(cfts); mutex_unlock(&cgroup_mutex); return ret; } /** * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Similar to cgroup_add_cftypes() but the added files are only used for * the default hierarchy. */ int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft && cft->name[0] != '\0'; cft++) cft->flags |= __CFTYPE_ONLY_ON_DFL; return cgroup_add_cftypes(ss, cfts); } /** * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * * Similar to cgroup_add_cftypes() but the added files are only used for * the legacy hierarchies. */ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; for (cft = cfts; cft && cft->name[0] != '\0'; cft++) cft->flags |= __CFTYPE_NOT_ON_DFL; return cgroup_add_cftypes(ss, cfts); } /** * cgroup_file_notify - generate a file modified event for a cgroup_file * @cfile: target cgroup_file * * @cfile must have been obtained by setting cftype->file_offset. */ void cgroup_file_notify(struct cgroup_file *cfile) { unsigned long flags; spin_lock_irqsave(&cgroup_file_kn_lock, flags); if (cfile->kn) { unsigned long last = cfile->notified_at; unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV; if (time_in_range(jiffies, last, next)) { timer_reduce(&cfile->notify_timer, next); } else { kernfs_notify(cfile->kn); cfile->notified_at = jiffies; } } spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } /** * css_next_child - find the next child of a given css * @pos: the current position (%NULL to initiate traversal) * @parent: css whose children to walk * * This function returns the next child of @parent and should be called * under either cgroup_mutex or RCU read lock. The only requirement is * that @parent and @pos are accessible. The next sibling is guaranteed to * be returned regardless of their states. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *parent) { struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); /* * @pos could already have been unlinked from the sibling list. * Once a cgroup is removed, its ->sibling.next is no longer * updated when its next sibling changes. CSS_RELEASED is set when * @pos is taken off list, at which time its next pointer is valid, * and, as releases are serialized, the one pointed to by the next * pointer is guaranteed to not have started release yet. This * implies that if we observe !CSS_RELEASED on @pos in this RCU * critical section, the one pointed to by its next pointer is * guaranteed to not have finished its RCU grace period even if we * have dropped rcu_read_lock() inbetween iterations. * * If @pos has CSS_RELEASED set, its next pointer can't be * dereferenced; however, as each css is given a monotonically * increasing unique serial number and always appended to the * sibling list, the next one can be found by walking the parent's * children until the first css with higher serial number than * @pos's. While this path can be slower, it happens iff iteration * races against release and the race window is very small. */ if (!pos) { next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling); } else if (likely(!(pos->flags & CSS_RELEASED))) { next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); } else { list_for_each_entry_rcu(next, &parent->children, sibling) if (next->serial_nr > pos->serial_nr) break; } /* * @next, if not pointing to the head, can be dereferenced and is * the next sibling. */ if (&next->sibling != &parent->children) return next; return NULL; } /** * css_next_descendant_pre - find the next descendant for pre-order walk * @pos: the current position (%NULL to initiate traversal) * @root: css whose descendants to walk * * To be used by css_for_each_descendant_pre(). Find the next descendant * to visit for pre-order traversal of @root's descendants. @root is * included in the iteration and the first node to be visited. * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical * section. This function will return the correct next descendant as long * as both @pos and @root are accessible and @pos is a descendant of @root. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * css_next_descendant_pre(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *root) { struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit @root */ if (!pos) return root; /* visit the first child if exists */ next = css_next_child(NULL, pos); if (next) return next; /* no child, visit my or the closest ancestor's next sibling */ while (pos != root) { next = css_next_child(pos, pos->parent); if (next) return next; pos = pos->parent; } return NULL; } /** * css_rightmost_descendant - return the rightmost descendant of a css * @pos: css of interest * * Return the rightmost descendant of @pos. If there's no descendant, @pos * is returned. This can be used during pre-order traversal to skip * subtree of @pos. * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical * section. This function will return the correct rightmost descendant as * long as @pos is accessible. */ struct cgroup_subsys_state * css_rightmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last, *tmp; cgroup_assert_mutex_or_rcu_locked(); do { last = pos; /* ->prev isn't RCU safe, walk ->next till the end */ pos = NULL; css_for_each_child(tmp, last) pos = tmp; } while (pos); return last; } static struct cgroup_subsys_state * css_leftmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last; do { last = pos; pos = css_next_child(NULL, pos); } while (pos); return last; } /** * css_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) * @root: css whose descendants to walk * * To be used by css_for_each_descendant_post(). Find the next descendant * to visit for post-order traversal of @root's descendants. @root is * included in the iteration and the last node to be visited. * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical * section. This function will return the correct next descendant as long * as both @pos and @cgroup are accessible and @pos is a descendant of * @cgroup. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *root) { struct cgroup_subsys_state *next; cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit leftmost descendant which may be @root */ if (!pos) return css_leftmost_descendant(root); /* if we visited @root, we're done */ if (pos == root) return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ next = css_next_child(pos, pos->parent); if (next) return css_leftmost_descendant(next); /* no sibling left, visit parent */ return pos->parent; } /** * css_has_online_children - does a css have online children * @css: the target css * * Returns %true if @css has any online children; otherwise, %false. This * function can be called from any context but the caller is responsible * for synchronizing against on/offlining as necessary. */ bool css_has_online_children(struct cgroup_subsys_state *css) { struct cgroup_subsys_state *child; bool ret = false; rcu_read_lock(); css_for_each_child(child, css) { if (child->flags & CSS_ONLINE) { ret = true; break; } } rcu_read_unlock(); return ret; } static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it) { struct list_head *l; struct cgrp_cset_link *link; struct css_set *cset; lockdep_assert_held(&css_set_lock); /* find the next threaded cset */ if (it->tcset_pos) { l = it->tcset_pos->next; if (l != it->tcset_head) { it->tcset_pos = l; return container_of(l, struct css_set, threaded_csets_node); } it->tcset_pos = NULL; } /* find the next cset */ l = it->cset_pos; l = l->next; if (l == it->cset_head) { it->cset_pos = NULL; return NULL; } if (it->ss) { cset = container_of(l, struct css_set, e_cset_node[it->ss->id]); } else { link = list_entry(l, struct cgrp_cset_link, cset_link); cset = link->cset; } it->cset_pos = l; /* initialize threaded css_set walking */ if (it->flags & CSS_TASK_ITER_THREADED) { if (it->cur_dcset) put_css_set_locked(it->cur_dcset); it->cur_dcset = cset; get_css_set(cset); it->tcset_head = &cset->threaded_csets; it->tcset_pos = &cset->threaded_csets; } return cset; } /** * css_task_iter_advance_css_set - advance a task itererator to the next css_set * @it: the iterator to advance * * Advance @it to the next css_set to walk. */ static void css_task_iter_advance_css_set(struct css_task_iter *it) { struct css_set *cset; lockdep_assert_held(&css_set_lock); /* Advance to the next non-empty css_set */ do { cset = css_task_iter_next_css_set(it); if (!cset) { it->task_pos = NULL; return; } } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); if (!list_empty(&cset->tasks)) { it->task_pos = cset->tasks.next; it->cur_tasks_head = &cset->tasks; } else if (!list_empty(&cset->mg_tasks)) { it->task_pos = cset->mg_tasks.next; it->cur_tasks_head = &cset->mg_tasks; } else { it->task_pos = cset->dying_tasks.next; it->cur_tasks_head = &cset->dying_tasks; } it->tasks_head = &cset->tasks; it->mg_tasks_head = &cset->mg_tasks; it->dying_tasks_head = &cset->dying_tasks; /* * We don't keep css_sets locked across iteration steps and thus * need to take steps to ensure that iteration can be resumed after * the lock is re-acquired. Iteration is performed at two levels - * css_sets and tasks in them. * * Once created, a css_set never leaves its cgroup lists, so a * pinned css_set is guaranteed to stay put and we can resume * iteration afterwards. * * Tasks may leave @cset across iteration steps. This is resolved * by registering each iterator with the css_set currently being * walked and making css_set_move_task() advance iterators whose * next task is leaving. */ if (it->cur_cset) { list_del(&it->iters_node); put_css_set_locked(it->cur_cset); } get_css_set(cset); it->cur_cset = cset; list_add(&it->iters_node, &cset->task_iters); } static void css_task_iter_skip(struct css_task_iter *it, struct task_struct *task) { lockdep_assert_held(&css_set_lock); if (it->task_pos == &task->cg_list) { it->task_pos = it->task_pos->next; it->flags |= CSS_TASK_ITER_SKIPPED; } } static void css_task_iter_advance(struct css_task_iter *it) { struct task_struct *task; lockdep_assert_held(&css_set_lock); repeat: if (it->task_pos) { /* * Advance iterator to find next entry. cset->tasks is * consumed first and then ->mg_tasks. After ->mg_tasks, * we move onto the next cset. */ if (it->flags & CSS_TASK_ITER_SKIPPED) it->flags &= ~CSS_TASK_ITER_SKIPPED; else it->task_pos = it->task_pos->next; if (it->task_pos == it->tasks_head) { it->task_pos = it->mg_tasks_head->next; it->cur_tasks_head = it->mg_tasks_head; } if (it->task_pos == it->mg_tasks_head) { it->task_pos = it->dying_tasks_head->next; it->cur_tasks_head = it->dying_tasks_head; } if (it->task_pos == it->dying_tasks_head) css_task_iter_advance_css_set(it); } else { /* called from start, proceed to the first cset */ css_task_iter_advance_css_set(it); } if (!it->task_pos) return; task = list_entry(it->task_pos, struct task_struct, cg_list); if (it->flags & CSS_TASK_ITER_PROCS) { /* if PROCS, skip over tasks which aren't group leaders */ if (!thread_group_leader(task)) goto repeat; /* and dying leaders w/o live member threads */ if (it->cur_tasks_head == it->dying_tasks_head && !atomic_read(&task->signal->live)) goto repeat; } else { /* skip all dying ones */ if (it->cur_tasks_head == it->dying_tasks_head) goto repeat; } } /** * css_task_iter_start - initiate task iteration * @css: the css to walk tasks of * @flags: CSS_TASK_ITER_* flags * @it: the task iterator to use * * Initiate iteration through the tasks of @css. The caller can call * css_task_iter_next() to walk through the tasks until the function * returns NULL. On completion of iteration, css_task_iter_end() must be * called. */ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it) { /* no one should try to iterate before mounting cgroups */ WARN_ON_ONCE(!use_task_css_set_links); memset(it, 0, sizeof(*it)); spin_lock_irq(&css_set_lock); it->ss = css->ss; it->flags = flags; if (it->ss) it->cset_pos = &css->cgroup->e_csets[css->ss->id]; else it->cset_pos = &css->cgroup->cset_links; it->cset_head = it->cset_pos; css_task_iter_advance(it); spin_unlock_irq(&css_set_lock); } /** * css_task_iter_next - return the next task for the iterator * @it: the task iterator being iterated * * The "next" function for task iteration. @it should have been * initialized via css_task_iter_start(). Returns NULL when the iteration * reaches the end. */ struct task_struct *css_task_iter_next(struct css_task_iter *it) { if (it->cur_task) { put_task_struct(it->cur_task); it->cur_task = NULL; } spin_lock_irq(&css_set_lock); /* @it may be half-advanced by skips, finish advancing */ if (it->flags & CSS_TASK_ITER_SKIPPED) css_task_iter_advance(it); if (it->task_pos) { it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list); get_task_struct(it->cur_task); css_task_iter_advance(it); } spin_unlock_irq(&css_set_lock); return it->cur_task; } /** * css_task_iter_end - finish task iteration * @it: the task iterator to finish * * Finish task iteration started by css_task_iter_start(). */ void css_task_iter_end(struct css_task_iter *it) { if (it->cur_cset) { spin_lock_irq(&css_set_lock); list_del(&it->iters_node); put_css_set_locked(it->cur_cset); spin_unlock_irq(&css_set_lock); } if (it->cur_dcset) put_css_set(it->cur_dcset); if (it->cur_task) put_task_struct(it->cur_task); } static void cgroup_procs_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; if (ctx->procs.started) css_task_iter_end(&ctx->procs.iter); } static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; if (pos) (*pos)++; return css_task_iter_next(&ctx->procs.iter); } static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, unsigned int iter_flags) { struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_file_ctx *ctx = of->priv; struct css_task_iter *it = &ctx->procs.iter; /* * When a seq_file is seeked, it's always traversed sequentially * from position 0, so we can simply keep iterating on !0 *pos. */ if (!ctx->procs.started) { if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL); css_task_iter_start(&cgrp->self, iter_flags, it); ctx->procs.started = true; } else if (!(*pos)) { css_task_iter_end(it); css_task_iter_start(&cgrp->self, iter_flags, it); } else return it->cur_task; return cgroup_procs_next(s, NULL, NULL); } static void *cgroup_procs_start(struct seq_file *s, loff_t *pos) { struct cgroup *cgrp = seq_css(s)->cgroup; /* * All processes of a threaded subtree belong to the domain cgroup * of the subtree. Only threads can be distributed across the * subtree. Reject reads on cgroup.procs in the subtree proper. * They're always empty anyway. */ if (cgroup_is_threaded(cgrp)) return ERR_PTR(-EOPNOTSUPP); return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED); } static int cgroup_procs_show(struct seq_file *s, void *v) { seq_printf(s, "%d\n", task_pid_vnr(v)); return 0; } static int cgroup_procs_write_permission(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, struct super_block *sb, struct cgroup_namespace *ns) { struct cgroup *com_cgrp = src_cgrp; struct inode *inode; int ret; lockdep_assert_held(&cgroup_mutex); /* find the common ancestor */ while (!cgroup_is_descendant(dst_cgrp, com_cgrp)) com_cgrp = cgroup_parent(com_cgrp); /* %current should be authorized to migrate to the common ancestor */ inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); if (!inode) return -ENOMEM; ret = inode_permission(inode, MAY_WRITE); iput(inode); if (ret) return ret; /* * If namespaces are delegation boundaries, %current must be able * to see both source and destination cgroups from its namespace. */ if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) || !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp))) return -ENOENT; return 0; } static ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; const struct cred *saved_cred; ssize_t ret; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; task = cgroup_procs_write_start(buf, true); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; /* find the source cgroup */ spin_lock_irq(&css_set_lock); src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); /* * Process and thread migrations follow same delegation rule. Check * permissions using the credentials from file open to protect against * inherited fd attacks. */ saved_cred = override_creds(of->file->f_cred); ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, of->file->f_path.dentry->d_sb, ctx->ns); revert_creds(saved_cred); if (ret) goto out_finish; ret = cgroup_attach_task(dst_cgrp, task, true); out_finish: cgroup_procs_write_finish(task); out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; } static void *cgroup_threads_start(struct seq_file *s, loff_t *pos) { return __cgroup_procs_start(s, pos, 0); } static ssize_t cgroup_threads_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; const struct cred *saved_cred; ssize_t ret; buf = strstrip(buf); dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; task = cgroup_procs_write_start(buf, false); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; /* find the source cgroup */ spin_lock_irq(&css_set_lock); src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); /* * Process and thread migrations follow same delegation rule. Check * permissions using the credentials from file open to protect against * inherited fd attacks. */ saved_cred = override_creds(of->file->f_cred); ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, of->file->f_path.dentry->d_sb, ctx->ns); revert_creds(saved_cred); if (ret) goto out_finish; /* and must be contained in the same domain */ ret = -EOPNOTSUPP; if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp) goto out_finish; ret = cgroup_attach_task(dst_cgrp, task, false); out_finish: cgroup_procs_write_finish(task); out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; } /* cgroup core interface files for the default hierarchy */ static struct cftype cgroup_base_files[] = { { .name = "cgroup.type", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_type_show, .write = cgroup_type_write, }, { .name = "cgroup.procs", .flags = CFTYPE_NS_DELEGATABLE, .file_offset = offsetof(struct cgroup, procs_file), .release = cgroup_procs_release, .seq_start = cgroup_procs_start, .seq_next = cgroup_procs_next, .seq_show = cgroup_procs_show, .write = cgroup_procs_write, }, { .name = "cgroup.threads", .flags = CFTYPE_NS_DELEGATABLE, .release = cgroup_procs_release, .seq_start = cgroup_threads_start, .seq_next = cgroup_procs_next, .seq_show = cgroup_procs_show, .write = cgroup_threads_write, }, { .name = "cgroup.controllers", .seq_show = cgroup_controllers_show, }, { .name = "cgroup.subtree_control", .flags = CFTYPE_NS_DELEGATABLE, .seq_show = cgroup_subtree_control_show, .write = cgroup_subtree_control_write, }, { .name = "cgroup.events", .flags = CFTYPE_NOT_ON_ROOT, .file_offset = offsetof(struct cgroup, events_file), .seq_show = cgroup_events_show, }, { .name = "cgroup.max.descendants", .seq_show = cgroup_max_descendants_show, .write = cgroup_max_descendants_write, }, { .name = "cgroup.max.depth", .seq_show = cgroup_max_depth_show, .write = cgroup_max_depth_write, }, { .name = "cgroup.stat", .seq_show = cgroup_stat_show, }, { .name = "cpu.stat", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_stat_show, }, { } /* terminate */ }; /* * css destruction is four-stage process. * * 1. Destruction starts. Killing of the percpu_ref is initiated. * Implemented in kill_css(). * * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs * and thus css_tryget_online() is guaranteed to fail, the css can be * offlined by invoking offline_css(). After offlining, the base ref is * put. Implemented in css_killed_work_fn(). * * 3. When the percpu_ref reaches zero, the only possible remaining * accessors are inside RCU read sections. css_release() schedules the * RCU callback. * * 4. After the grace period, the css can be freed. Implemented in * css_free_work_fn(). * * It is actually hairier because both step 2 and 4 require process context * and thus involve punting to css->destroy_work adding two additional * steps to the already complex sequence. */ static void css_free_rwork_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(to_rcu_work(work), struct cgroup_subsys_state, destroy_rwork); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; percpu_ref_exit(&css->refcnt); if (ss) { /* css free path */ struct cgroup_subsys_state *parent = css->parent; int id = css->id; ss->css_free(css); cgroup_idr_remove(&ss->css_idr, id); cgroup_put(cgrp); if (parent) css_put(parent); } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); if (cgroup_parent(cgrp)) { /* * We get a ref to the parent, and put the ref when * this cgroup is being freed, so it's guaranteed * that the parent won't be destroyed before its * children. */ cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); if (cgroup_on_dfl(cgrp)) cgroup_rstat_exit(cgrp); kfree(cgrp); } else { /* * This is root cgroup's refcnt reaching zero, * which indicates that the root should be * released. */ cgroup_destroy_root(cgrp->root); } } } static void css_release_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; mutex_lock(&cgroup_mutex); css->flags |= CSS_RELEASED; list_del_rcu(&css->sibling); if (ss) { /* css release path */ if (!list_empty(&css->rstat_css_node)) { cgroup_rstat_flush(cgrp); list_del_rcu(&css->rstat_css_node); } cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) ss->css_released(css); } else { struct cgroup *tcgrp; /* cgroup release path */ TRACE_CGROUP_PATH(release, cgrp); if (cgroup_on_dfl(cgrp)) cgroup_rstat_flush(cgrp); spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) tcgrp->nr_dying_descendants--; spin_unlock_irq(&css_set_lock); cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; /* * There are two control paths which try to determine * cgroup from dentry without going through kernfs - * cgroupstats_build() and css_tryget_online_from_dir(). * Those are supported by RCU protecting clearing of * cgrp->kn->priv backpointer. */ if (cgrp->kn) RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); cgroup_bpf_put(cgrp); } mutex_unlock(&cgroup_mutex); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_release_work_fn); queue_work(cgroup_destroy_wq, &css->destroy_work); } static void init_and_link_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { lockdep_assert_held(&cgroup_mutex); cgroup_get_live(cgrp); memset(css, 0, sizeof(*css)); css->cgroup = cgrp; css->ss = ss; css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); INIT_LIST_HEAD(&css->rstat_css_node); css->serial_nr = css_serial_nr_next++; atomic_set(&css->online_cnt, 0); if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); css_get(css->parent); } if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush) list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list); BUG_ON(cgroup_css(cgrp, ss)); } /* invoke ->css_online() on a new CSS and mark it online if successful */ static int online_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; int ret = 0; lockdep_assert_held(&cgroup_mutex); if (ss->css_online) ret = ss->css_online(css); if (!ret) { css->flags |= CSS_ONLINE; rcu_assign_pointer(css->cgroup->subsys[ss->id], css); atomic_inc(&css->online_cnt); if (css->parent) atomic_inc(&css->parent->online_cnt); } return ret; } /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ static void offline_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; lockdep_assert_held(&cgroup_mutex); if (!(css->flags & CSS_ONLINE)) return; if (ss->css_offline) ss->css_offline(css); css->flags &= ~CSS_ONLINE; RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); } /** * css_create - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with * @ss: the subsys of new css * * Create a new css associated with @cgrp - @ss pair. On success, the new * css is online and installed in @cgrp. This function doesn't create the * interface files. Returns 0 on success, -errno on failure. */ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); struct cgroup_subsys_state *css; int err; lockdep_assert_held(&cgroup_mutex); css = ss->css_alloc(parent_css); if (!css) css = ERR_PTR(-ENOMEM); if (IS_ERR(css)) return css; init_and_link_css(css, ss, cgrp); err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); if (err) goto err_free_css; err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); if (err < 0) goto err_free_css; css->id = err; /* @css is ready to be brought online now, make it visible */ list_add_tail_rcu(&css->sibling, &parent_css->children); cgroup_idr_replace(&ss->css_idr, css, css->id); err = online_css(css); if (err) goto err_list_del; if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && cgroup_parent(parent)) { pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", current->comm, current->pid, ss->name); if (!strcmp(ss->name, "memory")) pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n"); ss->warned_broken_hierarchy = true; } return css; err_list_del: list_del_rcu(&css->sibling); err_free_css: list_del_rcu(&css->rstat_css_node); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); return ERR_PTR(err); } /* * The returned cgroup is fully initialized including its control mask, but * it isn't associated with its kernfs_node and doesn't have the control * mask applied. */ static struct cgroup *cgroup_create(struct cgroup *parent) { struct cgroup_root *root = parent->root; struct cgroup *cgrp, *tcgrp; int level = parent->level + 1; int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)), GFP_KERNEL); if (!cgrp) return ERR_PTR(-ENOMEM); ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) goto out_free_cgrp; if (cgroup_on_dfl(parent)) { ret = cgroup_rstat_init(cgrp); if (ret) goto out_cancel_ref; } /* * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. */ cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); if (cgrp->id < 0) { ret = -ENOMEM; goto out_stat_exit; } init_cgroup_housekeeping(cgrp); cgrp->self.parent = &parent->self; cgrp->root = root; cgrp->level = level; ret = cgroup_bpf_inherit(cgrp); if (ret) goto out_idr_free; spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; if (tcgrp != cgrp) tcgrp->nr_descendants++; } spin_unlock_irq(&css_set_lock); if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); cgrp->self.serial_nr = css_serial_nr_next++; /* allocation complete, commit to creation */ list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); atomic_inc(&root->nr_cgrps); cgroup_get_live(parent); /* * @cgrp is now fully operational. If something fails after this * point, it'll be released via the normal destruction path. */ cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); /* * On the default hierarchy, a child doesn't automatically inherit * subtree_control from the parent. Each is configured manually. */ if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); cgroup_propagate_control(cgrp); return cgrp; out_idr_free: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_stat_exit: if (cgroup_on_dfl(parent)) cgroup_rstat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: kfree(cgrp); return ERR_PTR(ret); } static bool cgroup_check_hierarchy_limits(struct cgroup *parent) { struct cgroup *cgroup; int ret = false; int level = 1; lockdep_assert_held(&cgroup_mutex); for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) { if (cgroup->nr_descendants >= cgroup->max_descendants) goto fail; if (level > cgroup->max_depth) goto fail; level++; } ret = true; fail: return ret; } int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; struct kernfs_node *kn; int ret; /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ if (strchr(name, '\n')) return -EINVAL; parent = cgroup_kn_lock_live(parent_kn, false); if (!parent) return -ENODEV; if (!cgroup_check_hierarchy_limits(parent)) { ret = -EAGAIN; goto out_unlock; } cgrp = cgroup_create(parent); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); goto out_unlock; } /* create the directory */ kn = kernfs_create_dir(parent->kn, name, mode, cgrp); if (IS_ERR(kn)) { ret = PTR_ERR(kn); goto out_destroy; } cgrp->kn = kn; /* * This extra ref will be put in cgroup_free_fn() and guarantees * that @cgrp->kn is always accessible. */ kernfs_get(kn); ret = cgroup_kn_set_ugid(kn); if (ret) goto out_destroy; ret = css_populate_dir(&cgrp->self); if (ret) goto out_destroy; ret = cgroup_apply_control_enable(cgrp); if (ret) goto out_destroy; TRACE_CGROUP_PATH(mkdir, cgrp); /* let's create and online css's */ kernfs_activate(kn); ret = 0; goto out_unlock; out_destroy: cgroup_destroy_locked(cgrp); out_unlock: cgroup_kn_unlock(parent_kn); return ret; } /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget_online() is now guaranteed to fail. Tell the subsystem to * initate destruction and put the css ref from kill_css(). */ static void css_killed_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); mutex_lock(&cgroup_mutex); do { offline_css(css); css_put(css); /* @css can't go away while we're holding cgroup_mutex */ css = css->parent; } while (css && atomic_dec_and_test(&css->online_cnt)); mutex_unlock(&cgroup_mutex); } /* css kill confirmation processing requires process context, bounce */ static void css_killed_ref_fn(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); if (atomic_dec_and_test(&css->online_cnt)) { INIT_WORK(&css->destroy_work, css_killed_work_fn); queue_work(cgroup_destroy_wq, &css->destroy_work); } } /** * kill_css - destroy a css * @css: css to destroy * * This function initiates destruction of @css by removing cgroup interface * files and putting its base reference. ->css_offline() will be invoked * asynchronously once css_tryget_online() is guaranteed to fail and when * the reference count reaches zero, @css will be released. */ static void kill_css(struct cgroup_subsys_state *css) { lockdep_assert_held(&cgroup_mutex); if (css->flags & CSS_DYING) return; css->flags |= CSS_DYING; /* * This must happen before css is disassociated with its cgroup. * See seq_css() for details. */ css_clear_dir(css); /* * Killing would put the base ref, but we need to keep it alive * until after ->css_offline(). */ css_get(css); /* * cgroup core guarantees that, by the time ->css_offline() is * invoked, no new css reference will be given out via * css_tryget_online(). We can't simply call percpu_ref_kill() and * proceed to offlining css's because percpu_ref_kill() doesn't * guarantee that the ref is seen as killed on all CPUs on return. * * Use percpu_ref_kill_and_confirm() to get notifications as each * css is confirmed to be seen as killed on all CPUs. */ percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } /** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed * * css's make use of percpu refcnts whose killing latency shouldn't be * exposed to userland and are RCU protected. Also, cgroup core needs to * guarantee that css_tryget_online() won't succeed by the time * ->css_offline() is invoked. To satisfy all the requirements, * destruction is implemented in the following two steps. * * s1. Verify @cgrp can be destroyed and mark it dying. Remove all * userland visible parts and start killing the percpu refcnts of * css's. Set up so that the next stage will be kicked off once all * the percpu refcnts are confirmed to be killed. * * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the * rest of destruction. Once all cgroup references are gone, the * cgroup is RCU-freed. * * This function implements s1. After this step, @cgrp is gone as far as * the userland is concerned and a new cgroup with the same name may be * created. As cgroup doesn't care about the names internally, this * doesn't cause any problem. */ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; int ssid; lockdep_assert_held(&cgroup_mutex); /* * Only migration can raise populated from zero and we're already * holding cgroup_mutex. */ if (cgroup_is_populated(cgrp)) return -EBUSY; /* * Make sure there's no live children. We can't test emptiness of * ->self.children as dead children linger on it while being * drained; otherwise, "rmdir parent/child parent" may fail. */ if (css_has_online_children(&cgrp->self)) return -EBUSY; /* * Mark @cgrp and the associated csets dead. The former prevents * further task migration and child creation by disabling * cgroup_lock_live_group(). The latter makes the csets ignored by * the migration path. */ cgrp->self.flags &= ~CSS_ONLINE; spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) link->cset->dead = true; spin_unlock_irq(&css_set_lock); /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */ css_clear_dir(&cgrp->self); kernfs_remove(cgrp->kn); if (parent && cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; } spin_unlock_irq(&css_set_lock); cgroup1_check_for_release(parent); /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); return 0; }; int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; int ret = 0; cgrp = cgroup_kn_lock_live(kn, false); if (!cgrp) return 0; ret = cgroup_destroy_locked(cgrp); if (!ret) TRACE_CGROUP_PATH(rmdir, cgrp); cgroup_kn_unlock(kn); return ret; } static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .show_options = cgroup_show_options, .remount_fs = cgroup_remount, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, }; static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) { struct cgroup_subsys_state *css; pr_debug("Initializing cgroup subsys %s\n", ss->name); mutex_lock(&cgroup_mutex); idr_init(&ss->css_idr); INIT_LIST_HEAD(&ss->cfts); /* Create the root cgroup state for this subsystem */ ss->root = &cgrp_dfl_root; css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); /* * Root csses are never destroyed and we can't initialize * percpu_ref during early init. Disable refcnting. */ css->flags |= CSS_NO_REF; if (early) { /* allocation can't be done safely during early init */ css->id = 1; } else { css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); } /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is * newly registered, all tasks and hence the * init_css_set is in the subsystem's root cgroup. */ init_css_set.subsys[ss->id] = css; have_fork_callback |= (bool)ss->fork << ss->id; have_exit_callback |= (bool)ss->exit << ss->id; have_release_callback |= (bool)ss->release << ss->id; have_canfork_callback |= (bool)ss->can_fork << ss->id; /* At system boot, before all subsystems have been * registered, no tasks have been forked, so we don't * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); BUG_ON(online_css(css)); mutex_unlock(&cgroup_mutex); } /** * cgroup_init_early - cgroup initialization at system boot * * Initialize cgroups at system boot, and initialize any * subsystems that request early init. */ int __init cgroup_init_early(void) { static struct cgroup_sb_opts __initdata opts; struct cgroup_subsys *ss; int i; init_cgroup_root(&cgrp_dfl_root, &opts); cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; RCU_INIT_POINTER(init_task.cgroups, &init_css_set); for_each_subsys(ss, i) { WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n", i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, ss->id, ss->name); WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); ss->id = i; ss->name = cgroup_subsys_name[i]; if (!ss->legacy_name) ss->legacy_name = cgroup_subsys_name[i]; if (ss->early_init) cgroup_init_subsys(ss, true); } return 0; } /** * cgroup_init - cgroup initialization * * Register cgroup filesystem and /proc file, and initialize * any subsystems that didn't request early init. */ int __init cgroup_init(void) { struct cgroup_subsys *ss; int ssid; BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); cgroup_rstat_boot(); /* * The latency of the synchronize_sched() is too high for cgroups, * avoid it at the cost of forcing all readers into the slow path. */ rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); get_user_ns(init_cgroup_ns.user_ns); mutex_lock(&cgroup_mutex); /* * Add init_css_set to the hash table so that dfl_root can link to * it during init. */ hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0)); mutex_unlock(&cgroup_mutex); for_each_subsys(ss, ssid) { if (ss->early_init) { struct cgroup_subsys_state *css = init_css_set.subsys[ss->id]; css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); BUG_ON(css->id < 0); } else { cgroup_init_subsys(ss, false); } list_add_tail(&init_css_set.e_cset_node[ssid], &cgrp_dfl_root.cgrp.e_csets[ssid]); /* * Setting dfl_root subsys_mask needs to consider the * disabled flag and cftype registration needs kmalloc, * both of which aren't available during early_init. */ if (!cgroup_ssid_enabled(ssid)) continue; if (cgroup1_ssid_disabled(ssid)) printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", ss->name); cgrp_dfl_root.subsys_mask |= 1 << ss->id; /* implicit controllers must be threaded too */ WARN_ON(ss->implicit_on_dfl && !ss->threaded); if (ss->implicit_on_dfl) cgrp_dfl_implicit_ss_mask |= 1 << ss->id; else if (!ss->dfl_cftypes) cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; if (ss->threaded) cgrp_dfl_threaded_ss_mask |= 1 << ss->id; if (ss->dfl_cftypes == ss->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); } else { WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); } if (ss->bind) ss->bind(init_css_set.subsys[ssid]); mutex_lock(&cgroup_mutex); css_populate_dir(init_css_set.subsys[ssid]); mutex_unlock(&cgroup_mutex); } /* init_css_set.subsys[] has been updated, re-hash */ hash_del(&init_css_set.hlist); hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show)); return 0; } static int __init cgroup_wq_init(void) { /* * There isn't much point in executing destruction path in * parallel. Good chunk is serialized with cgroup_mutex anyway. * Use 1 for @max_active. * * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); BUG_ON(!cgroup_destroy_wq); return 0; } core_initcall(cgroup_wq_init); void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, char *buf, size_t buflen) { struct kernfs_node *kn; kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) return; kernfs_path(kn, buf, buflen); kernfs_put(kn); } /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy * - Used for /proc/<pid>/cgroup. */ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk) { char *buf; int retval; struct cgroup_root *root; retval = -ENOMEM; buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) goto out; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); for_each_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; int ssid, count = 0; if (root == &cgrp_dfl_root && !cgrp_dfl_visible) continue; seq_printf(m, "%d:", root->hierarchy_id); if (root != &cgrp_dfl_root) for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_printf(m, "%s%s", count++ ? "," : "", ss->legacy_name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", root->name); seq_putc(m, ':'); cgrp = task_cgroup_from_root(tsk, root); /* * On traditional hierarchies, all zombie tasks show up as * belonging to the root cgroup. On the default hierarchy, * while a zombie doesn't show up in "cgroup.procs" and * thus can't be migrated, its /proc/PID/cgroup keeps * reporting the cgroup it belonged to before exiting. If * the cgroup is removed before the zombie is reaped, * " (deleted)" is appended to the cgroup path. */ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, current->nsproxy->cgroup_ns); if (retval >= PATH_MAX) retval = -ENAMETOOLONG; if (retval < 0) goto out_unlock; seq_puts(m, buf); } else { seq_puts(m, "/"); } if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) seq_puts(m, " (deleted)\n"); else seq_putc(m, '\n'); } retval = 0; out_unlock: spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); kfree(buf); out: return retval; } /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. * * A task is associated with the init_css_set until cgroup_post_fork() * attaches it to the parent's css_set. Empty cg_list indicates that * @child isn't holding reference to its css_set. */ void cgroup_fork(struct task_struct *child) { RCU_INIT_POINTER(child->cgroups, &init_css_set); INIT_LIST_HEAD(&child->cg_list); } /** * cgroup_can_fork - called on a new task before the process is exposed * @child: the task in question. * * This calls the subsystem can_fork() callbacks. If the can_fork() callback * returns an error, the fork aborts with that error code. This allows for * a cgroup subsystem to conditionally allow or deny new forks. */ int cgroup_can_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i, j, ret; do_each_subsys_mask(ss, i, have_canfork_callback) { ret = ss->can_fork(child); if (ret) goto out_revert; } while_each_subsys_mask(); return 0; out_revert: for_each_subsys(ss, j) { if (j >= i) break; if (ss->cancel_fork) ss->cancel_fork(child); } return ret; } /** * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() * @child: the task in question * * This calls the cancel_fork() callbacks if a fork failed *after* * cgroup_can_fork() succeded. */ void cgroup_cancel_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) if (ss->cancel_fork) ss->cancel_fork(child); } /** * cgroup_post_fork - called on a new task after adding it to the task list * @child: the task in question * * Adds the task to the list running through its css_set if necessary and * call the subsystem fork() callbacks. Has to be after the task is * visible on the task list in case we race with the first call to * cgroup_task_iter_start() - to guarantee that the new task ends up on its * list. */ void cgroup_post_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i; /* * This may race against cgroup_enable_task_cg_lists(). As that * function sets use_task_css_set_links before grabbing * tasklist_lock and we just went through tasklist_lock to add * @child, it's guaranteed that either we see the set * use_task_css_set_links or cgroup_enable_task_cg_lists() sees * @child during its iteration. * * If we won the race, @child is associated with %current's * css_set. Grabbing css_set_lock guarantees both that the * association is stable, and, on completion of the parent's * migration, @child is visible in the source of migration or * already in the destination cgroup. This guarantee is necessary * when implementing operations which need to migrate all tasks of * a cgroup to another. * * Note that if we lose to cgroup_enable_task_cg_lists(), @child * will remain in init_css_set. This is safe because all tasks are * in the init_css_set before cg_links is enabled and there's no * operation which transfers all tasks out of init_css_set. */ if (use_task_css_set_links) { struct css_set *cset; spin_lock_irq(&css_set_lock); cset = task_css_set(current); if (list_empty(&child->cg_list)) { get_css_set(cset); cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); } spin_unlock_irq(&css_set_lock); } /* * Call ss->fork(). This must happen after @child is linked on * css_set; otherwise, @child might change state between ->fork() * and addition to css_set. */ do_each_subsys_mask(ss, i, have_fork_callback) { ss->fork(child); } while_each_subsys_mask(); } /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process * * Description: Detach cgroup from @tsk and release it. * * Note that cgroups marked notify_on_release force every task in * them to take the global cgroup_mutex mutex when exiting. * This could impact scaling on very large systems. Be reluctant to * use notify_on_release cgroups where very high task exit scaling * is required on large systems. * * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We * call cgroup_exit() while the task is still competent to handle * notify_on_release(), then leave the task attached to the root cgroup in * each hierarchy for the remainder of its exit. No need to bother with * init_css_set refcnting. init_css_set never goes away and we can't race * with migration path - PF_EXITING is visible to migration path. */ void cgroup_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; struct css_set *cset; int i; /* * Unlink from @tsk from its css_set. As migration path can't race * with us, we can check css_set and cg_list without synchronization. */ cset = task_css_set(tsk); if (!list_empty(&tsk->cg_list)) { spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); list_add_tail(&tsk->cg_list, &cset->dying_tasks); cset->nr_tasks--; spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); } /* see cgroup_post_fork() for details */ do_each_subsys_mask(ss, i, have_exit_callback) { ss->exit(tsk); } while_each_subsys_mask(); } void cgroup_release(struct task_struct *task) { struct cgroup_subsys *ss; int ssid; do_each_subsys_mask(ss, ssid, have_release_callback) { ss->release(task); } while_each_subsys_mask(); if (use_task_css_set_links) { spin_lock_irq(&css_set_lock); css_set_skip_task_iters(task_css_set(task), task); list_del_init(&task->cg_list); spin_unlock_irq(&css_set_lock); } } void cgroup_free(struct task_struct *task) { struct css_set *cset = task_css_set(task); put_css_set(cset); } static int __init cgroup_disable(char *str) { struct cgroup_subsys *ss; char *token; int i; while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; for_each_subsys(ss, i) { if (strcmp(token, ss->name) && strcmp(token, ss->legacy_name)) continue; static_branch_disable(cgroup_subsys_enabled_key[i]); pr_info("Disabling %s control group subsystem\n", ss->name); } } return 1; } __setup("cgroup_disable=", cgroup_disable); /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest * @ss: subsystem of interest * * If @dentry is a directory for a cgroup which has @ss enabled on it, try * to get the corresponding css and return it. If such css doesn't exist * or can't be pinned, an ERR_PTR value is returned. */ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct file_system_type *s_type = dentry->d_sb->s_type; struct cgroup_subsys_state *css = NULL; struct cgroup *cgrp; /* is @dentry a cgroup dir? */ if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) || !kn || kernfs_type(kn) != KERNFS_DIR) return ERR_PTR(-EBADF); rcu_read_lock(); /* * This path doesn't originate from kernfs and @kn could already * have been or be removed at any point. @kn->priv is RCU * protected for this access. See css_release_work_fn() for details. */ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (cgrp) css = cgroup_css(cgrp, ss); if (!css || !css_tryget_online(css)) css = ERR_PTR(-ENOENT); rcu_read_unlock(); return css; } /** * css_from_id - lookup css by id * @id: the cgroup id * @ss: cgroup subsys to be looked into * * Returns the css if there's valid one with @id, otherwise returns NULL. * Should be called under rcu_read_lock(). */ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { WARN_ON_ONCE(!rcu_read_lock_held()); return idr_find(&ss->css_idr, id); } /** * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path * @path: path on the default hierarchy * * Find the cgroup at @path on the default hierarchy, increment its * reference count and return it. Returns pointer to the found cgroup on * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR) * if @path points to a non-directory. */ struct cgroup *cgroup_get_from_path(const char *path) { struct kernfs_node *kn; struct cgroup *cgrp; mutex_lock(&cgroup_mutex); kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); if (kn) { if (kernfs_type(kn) == KERNFS_DIR) { cgrp = kn->priv; cgroup_get_live(cgrp); } else { cgrp = ERR_PTR(-ENOTDIR); } kernfs_put(kn); } else { cgrp = ERR_PTR(-ENOENT); } mutex_unlock(&cgroup_mutex); return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_path); /** * cgroup_get_from_fd - get a cgroup pointer from a fd * @fd: fd obtained by open(cgroup2_dir) * * Find the cgroup from a fd which should be obtained * by opening a cgroup directory. Returns a pointer to the * cgroup on success. ERR_PTR is returned if the cgroup * cannot be found. */ struct cgroup *cgroup_get_from_fd(int fd) { struct cgroup_subsys_state *css; struct cgroup *cgrp; struct file *f; f = fget_raw(fd); if (!f) return ERR_PTR(-EBADF); css = css_tryget_online_from_dir(f->f_path.dentry, NULL); fput(f); if (IS_ERR(css)) return ERR_CAST(css); cgrp = css->cgroup; if (!cgroup_on_dfl(cgrp)) { cgroup_put(cgrp); return ERR_PTR(-EBADF); } return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_fd); /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. */ #ifdef CONFIG_SOCK_CGROUP_DATA #if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) DEFINE_SPINLOCK(cgroup_sk_update_lock); static bool cgroup_sk_alloc_disabled __read_mostly; void cgroup_sk_alloc_disable(void) { if (cgroup_sk_alloc_disabled) return; pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); cgroup_sk_alloc_disabled = true; } #else #define cgroup_sk_alloc_disabled false #endif void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { if (cgroup_sk_alloc_disabled) { skcd->no_refcnt = 1; return; } /* Don't associate the sock with unrelated interrupted task's cgroup. */ if (in_interrupt()) return; rcu_read_lock(); while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { skcd->val = (unsigned long)cset->dfl_cgrp; break; } cpu_relax(); } rcu_read_unlock(); } void cgroup_sk_clone(struct sock_cgroup_data *skcd) { /* Socket clone path */ if (skcd->val) { if (skcd->no_refcnt) return; /* * We might be cloning a socket which is left in an empty * cgroup and the cgroup might have already been rmdir'd. * Don't use cgroup_get_live(). */ cgroup_get(sock_cgroup_ptr(skcd)); } } void cgroup_sk_free(struct sock_cgroup_data *skcd) { if (skcd->no_refcnt) return; cgroup_put(sock_cgroup_ptr(skcd)); } #endif /* CONFIG_SOCK_CGROUP_DATA */ #ifdef CONFIG_CGROUP_BPF int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags) { int ret; mutex_lock(&cgroup_mutex); ret = __cgroup_bpf_attach(cgrp, prog, type, flags); mutex_unlock(&cgroup_mutex); return ret; } int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type, u32 flags) { int ret; mutex_lock(&cgroup_mutex); ret = __cgroup_bpf_detach(cgrp, prog, type, flags); mutex_unlock(&cgroup_mutex); return ret; } int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, union bpf_attr __user *uattr) { int ret; mutex_lock(&cgroup_mutex); ret = __cgroup_bpf_query(cgrp, attr, uattr); mutex_unlock(&cgroup_mutex); return ret; } #endif /* CONFIG_CGROUP_BPF */ #ifdef CONFIG_SYSFS static ssize_t show_delegatable_files(struct cftype *files, char *buf, ssize_t size, const char *prefix) { struct cftype *cft; ssize_t ret = 0; for (cft = files; cft && cft->name[0] != '\0'; cft++) { if (!(cft->flags & CFTYPE_NS_DELEGATABLE)) continue; if (prefix) ret += snprintf(buf + ret, size - ret, "%s.", prefix); ret += snprintf(buf + ret, size - ret, "%s\n", cft->name); if (unlikely(ret >= size)) { WARN_ON(1); break; } } return ret; } static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct cgroup_subsys *ss; int ssid; ssize_t ret = 0; ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret, NULL); for_each_subsys(ss, ssid) ret += show_delegatable_files(ss->dfl_cftypes, buf + ret, PAGE_SIZE - ret, cgroup_subsys_name[ssid]); return ret; } static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); static struct attribute *cgroup_sysfs_attrs[] = { &cgroup_delegate_attr.attr, &cgroup_features_attr.attr, NULL, }; static const struct attribute_group cgroup_sysfs_attr_group = { .attrs = cgroup_sysfs_attrs, .name = "cgroup", }; static int __init cgroup_sysfs_init(void) { return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); } subsys_initcall(cgroup_sysfs_init); #endif /* CONFIG_SYSFS */ |
10 10 10 10 10 9 1 10 10 10 10 2 2 2 5 5 2 2 2 2 1 2 2 1 2 1 14 14 5 12 13 13 1 1 12 3 12 11 1 11 1 10 10 10 10 1 6 9 9 9 7 9 8 7 6 3 6 6 6 6 2 4 4 7 13 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 | /* * fs/bfs/inode.c * BFS superblock and inode operations. * Copyright (C) 1999-2006 Tigran Aivazian <aivazian.tigran@gmail.com> * From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds. * * Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005. */ #include <linux/module.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/vfs.h> #include <linux/writeback.h> #include <linux/uio.h> #include <linux/uaccess.h> #include "bfs.h" MODULE_AUTHOR("Tigran Aivazian <aivazian.tigran@gmail.com>"); MODULE_DESCRIPTION("SCO UnixWare BFS filesystem for Linux"); MODULE_LICENSE("GPL"); #undef DEBUG #ifdef DEBUG #define dprintf(x...) printf(x) #else #define dprintf(x...) #endif struct inode *bfs_iget(struct super_block *sb, unsigned long ino) { struct bfs_inode *di; struct inode *inode; struct buffer_head *bh; int block, off; inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) { printf("Bad inode number %s:%08lx\n", inode->i_sb->s_id, ino); goto error; } block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1; bh = sb_bread(inode->i_sb, block); if (!bh) { printf("Unable to read inode %s:%08lx\n", inode->i_sb->s_id, ino); goto error; } off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; di = (struct bfs_inode *)bh->b_data + off; inode->i_mode = 0x0000FFFF & le32_to_cpu(di->i_mode); if (le32_to_cpu(di->i_vtype) == BFS_VDIR) { inode->i_mode |= S_IFDIR; inode->i_op = &bfs_dir_inops; inode->i_fop = &bfs_dir_operations; } else if (le32_to_cpu(di->i_vtype) == BFS_VREG) { inode->i_mode |= S_IFREG; inode->i_op = &bfs_file_inops; inode->i_fop = &bfs_file_operations; inode->i_mapping->a_ops = &bfs_aops; } BFS_I(inode)->i_sblock = le32_to_cpu(di->i_sblock); BFS_I(inode)->i_eblock = le32_to_cpu(di->i_eblock); BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); i_uid_write(inode, le32_to_cpu(di->i_uid)); i_gid_write(inode, le32_to_cpu(di->i_gid)); set_nlink(inode, le32_to_cpu(di->i_nlink)); inode->i_size = BFS_FILESIZE(di); inode->i_blocks = BFS_FILEBLOCKS(di); inode->i_atime.tv_sec = le32_to_cpu(di->i_atime); inode->i_mtime.tv_sec = le32_to_cpu(di->i_mtime); inode->i_ctime.tv_sec = le32_to_cpu(di->i_ctime); inode->i_atime.tv_nsec = 0; inode->i_mtime.tv_nsec = 0; inode->i_ctime.tv_nsec = 0; brelse(bh); unlock_new_inode(inode); return inode; error: iget_failed(inode); return ERR_PTR(-EIO); } static struct bfs_inode *find_inode(struct super_block *sb, u16 ino, struct buffer_head **p) { if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(sb)->si_lasti)) { printf("Bad inode number %s:%08x\n", sb->s_id, ino); return ERR_PTR(-EIO); } ino -= BFS_ROOT_INO; *p = sb_bread(sb, 1 + ino / BFS_INODES_PER_BLOCK); if (!*p) { printf("Unable to read inode %s:%08x\n", sb->s_id, ino); return ERR_PTR(-EIO); } return (struct bfs_inode *)(*p)->b_data + ino % BFS_INODES_PER_BLOCK; } static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct bfs_sb_info *info = BFS_SB(inode->i_sb); unsigned int ino = (u16)inode->i_ino; unsigned long i_sblock; struct bfs_inode *di; struct buffer_head *bh; int err = 0; dprintf("ino=%08x\n", ino); di = find_inode(inode->i_sb, ino, &bh); if (IS_ERR(di)) return PTR_ERR(di); mutex_lock(&info->bfs_lock); if (ino == BFS_ROOT_INO) di->i_vtype = cpu_to_le32(BFS_VDIR); else di->i_vtype = cpu_to_le32(BFS_VREG); di->i_ino = cpu_to_le16(ino); di->i_mode = cpu_to_le32(inode->i_mode); di->i_uid = cpu_to_le32(i_uid_read(inode)); di->i_gid = cpu_to_le32(i_gid_read(inode)); di->i_nlink = cpu_to_le32(inode->i_nlink); di->i_atime = cpu_to_le32(inode->i_atime.tv_sec); di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); i_sblock = BFS_I(inode)->i_sblock; di->i_sblock = cpu_to_le32(i_sblock); di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock); di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); mark_buffer_dirty(bh); if (wbc->sync_mode == WB_SYNC_ALL) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) err = -EIO; } brelse(bh); mutex_unlock(&info->bfs_lock); return err; } static void bfs_evict_inode(struct inode *inode) { unsigned long ino = inode->i_ino; struct bfs_inode *di; struct buffer_head *bh; struct super_block *s = inode->i_sb; struct bfs_sb_info *info = BFS_SB(s); struct bfs_inode_info *bi = BFS_I(inode); dprintf("ino=%08lx\n", ino); truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); clear_inode(inode); if (inode->i_nlink) return; di = find_inode(s, inode->i_ino, &bh); if (IS_ERR(di)) return; mutex_lock(&info->bfs_lock); /* clear on-disk inode */ memset(di, 0, sizeof(struct bfs_inode)); mark_buffer_dirty(bh); brelse(bh); if (bi->i_dsk_ino) { if (bi->i_sblock) info->si_freeb += bi->i_eblock + 1 - bi->i_sblock; info->si_freei++; clear_bit(ino, info->si_imap); bfs_dump_imap("delete_inode", s); } /* * If this was the last file, make the previous block * "last block of the last file" even if there is no * real file there, saves us 1 gap. */ if (info->si_lf_eblk == bi->i_eblock) info->si_lf_eblk = bi->i_sblock - 1; mutex_unlock(&info->bfs_lock); } static void bfs_put_super(struct super_block *s) { struct bfs_sb_info *info = BFS_SB(s); if (!info) return; mutex_destroy(&info->bfs_lock); kfree(info->si_imap); kfree(info); s->s_fs_info = NULL; } static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *s = dentry->d_sb; struct bfs_sb_info *info = BFS_SB(s); u64 id = huge_encode_dev(s->s_bdev->bd_dev); buf->f_type = BFS_MAGIC; buf->f_bsize = s->s_blocksize; buf->f_blocks = info->si_blocks; buf->f_bfree = buf->f_bavail = info->si_freeb; buf->f_files = info->si_lasti + 1 - BFS_ROOT_INO; buf->f_ffree = info->si_freei; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = BFS_NAMELEN; return 0; } static struct kmem_cache *bfs_inode_cachep; static struct inode *bfs_alloc_inode(struct super_block *sb) { struct bfs_inode_info *bi; bi = kmem_cache_alloc(bfs_inode_cachep, GFP_KERNEL); if (!bi) return NULL; return &bi->vfs_inode; } static void bfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(bfs_inode_cachep, BFS_I(inode)); } static void bfs_destroy_inode(struct inode *inode) { call_rcu(&inode->i_rcu, bfs_i_callback); } static void init_once(void *foo) { struct bfs_inode_info *bi = foo; inode_init_once(&bi->vfs_inode); } static int __init init_inodecache(void) { bfs_inode_cachep = kmem_cache_create("bfs_inode_cache", sizeof(struct bfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (bfs_inode_cachep == NULL) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(bfs_inode_cachep); } static const struct super_operations bfs_sops = { .alloc_inode = bfs_alloc_inode, .destroy_inode = bfs_destroy_inode, .write_inode = bfs_write_inode, .evict_inode = bfs_evict_inode, .put_super = bfs_put_super, .statfs = bfs_statfs, }; void bfs_dump_imap(const char *prefix, struct super_block *s) { #ifdef DEBUG int i; char *tmpbuf = (char *)get_zeroed_page(GFP_KERNEL); if (!tmpbuf) return; for (i = BFS_SB(s)->si_lasti; i >= 0; i--) { if (i > PAGE_SIZE - 100) break; if (test_bit(i, BFS_SB(s)->si_imap)) strcat(tmpbuf, "1"); else strcat(tmpbuf, "0"); } printf("BFS-fs: %s: lasti=%08lx <%s>\n", prefix, BFS_SB(s)->si_lasti, tmpbuf); free_page((unsigned long)tmpbuf); #endif } static int bfs_fill_super(struct super_block *s, void *data, int silent) { struct buffer_head *bh, *sbh; struct bfs_super_block *bfs_sb; struct inode *inode; unsigned i, imap_len; struct bfs_sb_info *info; int ret = -EINVAL; unsigned long i_sblock, i_eblock, i_eoff, s_size; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; mutex_init(&info->bfs_lock); s->s_fs_info = info; sb_set_blocksize(s, BFS_BSIZE); sbh = sb_bread(s, 0); if (!sbh) goto out; bfs_sb = (struct bfs_super_block *)sbh->b_data; if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) { if (!silent) printf("No BFS filesystem on %s (magic=%08x)\n", s->s_id, le32_to_cpu(bfs_sb->s_magic)); goto out1; } if (BFS_UNCLEAN(bfs_sb, s) && !silent) printf("%s is unclean, continuing\n", s->s_id); s->s_magic = BFS_MAGIC; if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end) || le32_to_cpu(bfs_sb->s_start) < BFS_BSIZE) { printf("Superblock is corrupted\n"); goto out1; } info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / sizeof(struct bfs_inode) + BFS_ROOT_INO - 1; imap_len = (info->si_lasti / 8) + 1; info->si_imap = kzalloc(imap_len, GFP_KERNEL | __GFP_NOWARN); if (!info->si_imap) { printf("Cannot allocate %u bytes\n", imap_len); goto out1; } for (i = 0; i < BFS_ROOT_INO; i++) set_bit(i, info->si_imap); s->s_op = &bfs_sops; inode = bfs_iget(s, BFS_ROOT_INO); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto out2; } s->s_root = d_make_root(inode); if (!s->s_root) { ret = -ENOMEM; goto out2; } info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS; info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS; info->si_freei = 0; info->si_lf_eblk = 0; /* can we read the last block? */ bh = sb_bread(s, info->si_blocks - 1); if (!bh) { printf("Last block not available: %lu\n", info->si_blocks - 1); ret = -EIO; goto out3; } brelse(bh); bh = NULL; for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) { struct bfs_inode *di; int block = (i - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1; int off = (i - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; unsigned long eblock; if (!off) { brelse(bh); bh = sb_bread(s, block); } if (!bh) continue; di = (struct bfs_inode *)bh->b_data + off; /* test if filesystem is not corrupted */ i_eoff = le32_to_cpu(di->i_eoffset); i_sblock = le32_to_cpu(di->i_sblock); i_eblock = le32_to_cpu(di->i_eblock); s_size = le32_to_cpu(bfs_sb->s_end); if (i_sblock > info->si_blocks || i_eblock > info->si_blocks || i_sblock > i_eblock || (i_eoff != le32_to_cpu(-1) && i_eoff > s_size) || i_sblock * BFS_BSIZE > i_eoff) { printf("Inode 0x%08x corrupted\n", i); brelse(bh); ret = -EIO; goto out3; } if (!di->i_ino) { info->si_freei++; continue; } set_bit(i, info->si_imap); info->si_freeb -= BFS_FILEBLOCKS(di); eblock = le32_to_cpu(di->i_eblock); if (eblock > info->si_lf_eblk) info->si_lf_eblk = eblock; } brelse(bh); brelse(sbh); bfs_dump_imap("read_super", s); return 0; out3: dput(s->s_root); s->s_root = NULL; out2: kfree(info->si_imap); out1: brelse(sbh); out: mutex_destroy(&info->bfs_lock); kfree(info); s->s_fs_info = NULL; return ret; } static struct dentry *bfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super); } static struct file_system_type bfs_fs_type = { .owner = THIS_MODULE, .name = "bfs", .mount = bfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("bfs"); static int __init init_bfs_fs(void) { int err = init_inodecache(); if (err) goto out1; err = register_filesystem(&bfs_fs_type); if (err) goto out; return 0; out: destroy_inodecache(); out1: return err; } static void __exit exit_bfs_fs(void) { unregister_filesystem(&bfs_fs_type); destroy_inodecache(); } module_init(init_bfs_fs) module_exit(exit_bfs_fs) |
197 129 226 236 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <asm/unaligned.h> #include "ctree.h" static inline u8 get_unaligned_le8(const void *p) { return *(u8 *)p; } static inline void put_unaligned_le8(u8 val, void *p) { *(u8 *)p = val; } /* * this is some deeply nasty code. * * The end result is that anyone who #includes ctree.h gets a * declaration for the btrfs_set_foo functions and btrfs_foo functions, * which are wrappers of btrfs_set_token_#bits functions and * btrfs_get_token_#bits functions, which are defined in this file. * * These setget functions do all the extent_buffer related mapping * required to efficiently read and write specific fields in the extent * buffers. Every pointer to metadata items in btrfs is really just * an unsigned long offset into the extent buffer which has been * cast to a specific type. This gives us all the gcc type checking. * * The extent buffer api is used to do the page spanning work required to * have a metadata blocksize different from the page size. */ #define DEFINE_BTRFS_SETGET_BITS(bits) \ u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \ const void *ptr, unsigned long off, \ struct btrfs_map_token *token) \ { \ unsigned long part_offset = (unsigned long)ptr; \ unsigned long offset = part_offset + off; \ void *p; \ int err; \ char *kaddr; \ unsigned long map_start; \ unsigned long map_len; \ int size = sizeof(u##bits); \ u##bits res; \ \ if (token && token->kaddr && token->offset <= offset && \ token->eb == eb && \ (token->offset + PAGE_SIZE >= offset + size)) { \ kaddr = token->kaddr; \ p = kaddr + part_offset - token->offset; \ res = get_unaligned_le##bits(p + off); \ return res; \ } \ err = map_private_extent_buffer(eb, offset, size, \ &kaddr, &map_start, &map_len); \ if (err) { \ __le##bits leres; \ \ read_extent_buffer(eb, &leres, offset, size); \ return le##bits##_to_cpu(leres); \ } \ p = kaddr + part_offset - map_start; \ res = get_unaligned_le##bits(p + off); \ if (token) { \ token->kaddr = kaddr; \ token->offset = map_start; \ token->eb = eb; \ } \ return res; \ } \ void btrfs_set_token_##bits(struct extent_buffer *eb, \ const void *ptr, unsigned long off, \ u##bits val, \ struct btrfs_map_token *token) \ { \ unsigned long part_offset = (unsigned long)ptr; \ unsigned long offset = part_offset + off; \ void *p; \ int err; \ char *kaddr; \ unsigned long map_start; \ unsigned long map_len; \ int size = sizeof(u##bits); \ \ if (token && token->kaddr && token->offset <= offset && \ token->eb == eb && \ (token->offset + PAGE_SIZE >= offset + size)) { \ kaddr = token->kaddr; \ p = kaddr + part_offset - token->offset; \ put_unaligned_le##bits(val, p + off); \ return; \ } \ err = map_private_extent_buffer(eb, offset, size, \ &kaddr, &map_start, &map_len); \ if (err) { \ __le##bits val2; \ \ val2 = cpu_to_le##bits(val); \ write_extent_buffer(eb, &val2, offset, size); \ return; \ } \ p = kaddr + part_offset - map_start; \ put_unaligned_le##bits(val, p + off); \ if (token) { \ token->kaddr = kaddr; \ token->offset = map_start; \ token->eb = eb; \ } \ } DEFINE_BTRFS_SETGET_BITS(8) DEFINE_BTRFS_SETGET_BITS(16) DEFINE_BTRFS_SETGET_BITS(32) DEFINE_BTRFS_SETGET_BITS(64) void btrfs_node_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { unsigned long ptr = btrfs_node_key_ptr_offset(nr); read_eb_member(eb, (struct btrfs_key_ptr *)ptr, struct btrfs_key_ptr, key, disk_key); } |
112 17 17 17 32 138 76 76 24 113 112 17 138 32 146 145 146 25 26 76 76 76 76 76 76 23 3 31 9 2 5 4 5 4 6 6 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 | /* * Dummy soundcard * Copyright (c) by Jaroslav Kysela <perex@perex.cz> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ #include <linux/init.h> #include <linux/err.h> #include <linux/platform_device.h> #include <linux/jiffies.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/wait.h> #include <linux/hrtimer.h> #include <linux/math64.h> #include <linux/module.h> #include <sound/core.h> #include <sound/control.h> #include <sound/tlv.h> #include <sound/pcm.h> #include <sound/rawmidi.h> #include <sound/info.h> #include <sound/initval.h> MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>"); MODULE_DESCRIPTION("Dummy soundcard (/dev/null)"); MODULE_LICENSE("GPL"); MODULE_SUPPORTED_DEVICE("{{ALSA,Dummy soundcard}}"); #define MAX_PCM_DEVICES 4 #define MAX_PCM_SUBSTREAMS 128 #define MAX_MIDI_DEVICES 2 /* defaults */ #define MAX_BUFFER_SIZE (64*1024) #define MIN_PERIOD_SIZE 64 #define MAX_PERIOD_SIZE MAX_BUFFER_SIZE #define USE_FORMATS (SNDRV_PCM_FMTBIT_U8 | SNDRV_PCM_FMTBIT_S16_LE) #define USE_RATE SNDRV_PCM_RATE_CONTINUOUS | SNDRV_PCM_RATE_8000_48000 #define USE_RATE_MIN 5500 #define USE_RATE_MAX 48000 #define USE_CHANNELS_MIN 1 #define USE_CHANNELS_MAX 2 #define USE_PERIODS_MIN 1 #define USE_PERIODS_MAX 1024 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; /* Index 0-MAX */ static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR; /* ID for this card */ static bool enable[SNDRV_CARDS] = {1, [1 ... (SNDRV_CARDS - 1)] = 0}; static char *model[SNDRV_CARDS] = {[0 ... (SNDRV_CARDS - 1)] = NULL}; static int pcm_devs[SNDRV_CARDS] = {[0 ... (SNDRV_CARDS - 1)] = 1}; static int pcm_substreams[SNDRV_CARDS] = {[0 ... (SNDRV_CARDS - 1)] = 8}; //static int midi_devs[SNDRV_CARDS] = {[0 ... (SNDRV_CARDS - 1)] = 2}; #ifdef CONFIG_HIGH_RES_TIMERS static bool hrtimer = 1; #endif static bool fake_buffer = 1; module_param_array(index, int, NULL, 0444); MODULE_PARM_DESC(index, "Index value for dummy soundcard."); module_param_array(id, charp, NULL, 0444); MODULE_PARM_DESC(id, "ID string for dummy soundcard."); module_param_array(enable, bool, NULL, 0444); MODULE_PARM_DESC(enable, "Enable this dummy soundcard."); module_param_array(model, charp, NULL, 0444); MODULE_PARM_DESC(model, "Soundcard model."); module_param_array(pcm_devs, int, NULL, 0444); MODULE_PARM_DESC(pcm_devs, "PCM devices # (0-4) for dummy driver."); module_param_array(pcm_substreams, int, NULL, 0444); MODULE_PARM_DESC(pcm_substreams, "PCM substreams # (1-128) for dummy driver."); //module_param_array(midi_devs, int, NULL, 0444); //MODULE_PARM_DESC(midi_devs, "MIDI devices # (0-2) for dummy driver."); module_param(fake_buffer, bool, 0444); MODULE_PARM_DESC(fake_buffer, "Fake buffer allocations."); #ifdef CONFIG_HIGH_RES_TIMERS module_param(hrtimer, bool, 0644); MODULE_PARM_DESC(hrtimer, "Use hrtimer as the timer source."); #endif static struct platform_device *devices[SNDRV_CARDS]; #define MIXER_ADDR_MASTER 0 #define MIXER_ADDR_LINE 1 #define MIXER_ADDR_MIC 2 #define MIXER_ADDR_SYNTH 3 #define MIXER_ADDR_CD 4 #define MIXER_ADDR_LAST 4 struct dummy_timer_ops { int (*create)(struct snd_pcm_substream *); void (*free)(struct snd_pcm_substream *); int (*prepare)(struct snd_pcm_substream *); int (*start)(struct snd_pcm_substream *); int (*stop)(struct snd_pcm_substream *); snd_pcm_uframes_t (*pointer)(struct snd_pcm_substream *); }; #define get_dummy_ops(substream) \ (*(const struct dummy_timer_ops **)(substream)->runtime->private_data) struct dummy_model { const char *name; int (*playback_constraints)(struct snd_pcm_runtime *runtime); int (*capture_constraints)(struct snd_pcm_runtime *runtime); u64 formats; size_t buffer_bytes_max; size_t period_bytes_min; size_t period_bytes_max; unsigned int periods_min; unsigned int periods_max; unsigned int rates; unsigned int rate_min; unsigned int rate_max; unsigned int channels_min; unsigned int channels_max; }; struct snd_dummy { struct snd_card *card; struct dummy_model *model; struct snd_pcm *pcm; struct snd_pcm_hardware pcm_hw; spinlock_t mixer_lock; int mixer_volume[MIXER_ADDR_LAST+1][2]; int capture_source[MIXER_ADDR_LAST+1][2]; int iobox; struct snd_kcontrol *cd_volume_ctl; struct snd_kcontrol *cd_switch_ctl; }; /* * card models */ static int emu10k1_playback_constraints(struct snd_pcm_runtime *runtime) { int err; err = snd_pcm_hw_constraint_integer(runtime, SNDRV_PCM_HW_PARAM_PERIODS); if (err < 0) return err; err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, 256, UINT_MAX); if (err < 0) return err; return 0; } static struct dummy_model model_emu10k1 = { .name = "emu10k1", .playback_constraints = emu10k1_playback_constraints, .buffer_bytes_max = 128 * 1024, }; static struct dummy_model model_rme9652 = { .name = "rme9652", .buffer_bytes_max = 26 * 64 * 1024, .formats = SNDRV_PCM_FMTBIT_S32_LE, .channels_min = 26, .channels_max = 26, .periods_min = 2, .periods_max = 2, }; static struct dummy_model model_ice1712 = { .name = "ice1712", .buffer_bytes_max = 256 * 1024, .formats = SNDRV_PCM_FMTBIT_S32_LE, .channels_min = 10, .channels_max = 10, .periods_min = 1, .periods_max = 1024, }; static struct dummy_model model_uda1341 = { .name = "uda1341", .buffer_bytes_max = 16380, .formats = SNDRV_PCM_FMTBIT_S16_LE, .channels_min = 2, .channels_max = 2, .periods_min = 2, .periods_max = 255, }; static struct dummy_model model_ac97 = { .name = "ac97", .formats = SNDRV_PCM_FMTBIT_S16_LE, .channels_min = 2, .channels_max = 2, .rates = SNDRV_PCM_RATE_48000, .rate_min = 48000, .rate_max = 48000, }; static struct dummy_model model_ca0106 = { .name = "ca0106", .formats = SNDRV_PCM_FMTBIT_S16_LE, .buffer_bytes_max = ((65536-64)*8), .period_bytes_max = (65536-64), .periods_min = 2, .periods_max = 8, .channels_min = 2, .channels_max = 2, .rates = SNDRV_PCM_RATE_48000|SNDRV_PCM_RATE_96000|SNDRV_PCM_RATE_192000, .rate_min = 48000, .rate_max = 192000, }; static struct dummy_model *dummy_models[] = { &model_emu10k1, &model_rme9652, &model_ice1712, &model_uda1341, &model_ac97, &model_ca0106, NULL }; /* * system timer interface */ struct dummy_systimer_pcm { /* ops must be the first item */ const struct dummy_timer_ops *timer_ops; spinlock_t lock; struct timer_list timer; unsigned long base_time; unsigned int frac_pos; /* fractional sample position (based HZ) */ unsigned int frac_period_rest; unsigned int frac_buffer_size; /* buffer_size * HZ */ unsigned int frac_period_size; /* period_size * HZ */ unsigned int rate; int elapsed; struct snd_pcm_substream *substream; }; static void dummy_systimer_rearm(struct dummy_systimer_pcm *dpcm) { mod_timer(&dpcm->timer, jiffies + (dpcm->frac_period_rest + dpcm->rate - 1) / dpcm->rate); } static void dummy_systimer_update(struct dummy_systimer_pcm *dpcm) { unsigned long delta; delta = jiffies - dpcm->base_time; if (!delta) return; dpcm->base_time += delta; delta *= dpcm->rate; dpcm->frac_pos += delta; while (dpcm->frac_pos >= dpcm->frac_buffer_size) dpcm->frac_pos -= dpcm->frac_buffer_size; while (dpcm->frac_period_rest <= delta) { dpcm->elapsed++; dpcm->frac_period_rest += dpcm->frac_period_size; } dpcm->frac_period_rest -= delta; } static int dummy_systimer_start(struct snd_pcm_substream *substream) { struct dummy_systimer_pcm *dpcm = substream->runtime->private_data; spin_lock(&dpcm->lock); dpcm->base_time = jiffies; dummy_systimer_rearm(dpcm); spin_unlock(&dpcm->lock); return 0; } static int dummy_systimer_stop(struct snd_pcm_substream *substream) { struct dummy_systimer_pcm *dpcm = substream->runtime->private_data; spin_lock(&dpcm->lock); del_timer(&dpcm->timer); spin_unlock(&dpcm->lock); return 0; } static int dummy_systimer_prepare(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct dummy_systimer_pcm *dpcm = runtime->private_data; dpcm->frac_pos = 0; dpcm->rate = runtime->rate; dpcm->frac_buffer_size = runtime->buffer_size * HZ; dpcm->frac_period_size = runtime->period_size * HZ; dpcm->frac_period_rest = dpcm->frac_period_size; dpcm->elapsed = 0; return 0; } static void dummy_systimer_callback(struct timer_list *t) { struct dummy_systimer_pcm *dpcm = from_timer(dpcm, t, timer); unsigned long flags; int elapsed = 0; spin_lock_irqsave(&dpcm->lock, flags); dummy_systimer_update(dpcm); dummy_systimer_rearm(dpcm); elapsed = dpcm->elapsed; dpcm->elapsed = 0; spin_unlock_irqrestore(&dpcm->lock, flags); if (elapsed) snd_pcm_period_elapsed(dpcm->substream); } static snd_pcm_uframes_t dummy_systimer_pointer(struct snd_pcm_substream *substream) { struct dummy_systimer_pcm *dpcm = substream->runtime->private_data; snd_pcm_uframes_t pos; spin_lock(&dpcm->lock); dummy_systimer_update(dpcm); pos = dpcm->frac_pos / HZ; spin_unlock(&dpcm->lock); return pos; } static int dummy_systimer_create(struct snd_pcm_substream *substream) { struct dummy_systimer_pcm *dpcm; dpcm = kzalloc(sizeof(*dpcm), GFP_KERNEL); if (!dpcm) return -ENOMEM; substream->runtime->private_data = dpcm; timer_setup(&dpcm->timer, dummy_systimer_callback, 0); spin_lock_init(&dpcm->lock); dpcm->substream = substream; return 0; } static void dummy_systimer_free(struct snd_pcm_substream *substream) { kfree(substream->runtime->private_data); } static const struct dummy_timer_ops dummy_systimer_ops = { .create = dummy_systimer_create, .free = dummy_systimer_free, .prepare = dummy_systimer_prepare, .start = dummy_systimer_start, .stop = dummy_systimer_stop, .pointer = dummy_systimer_pointer, }; #ifdef CONFIG_HIGH_RES_TIMERS /* * hrtimer interface */ struct dummy_hrtimer_pcm { /* ops must be the first item */ const struct dummy_timer_ops *timer_ops; ktime_t base_time; ktime_t period_time; atomic_t running; struct hrtimer timer; struct snd_pcm_substream *substream; }; static enum hrtimer_restart dummy_hrtimer_callback(struct hrtimer *timer) { struct dummy_hrtimer_pcm *dpcm; dpcm = container_of(timer, struct dummy_hrtimer_pcm, timer); if (!atomic_read(&dpcm->running)) return HRTIMER_NORESTART; /* * In cases of XRUN and draining, this calls .trigger to stop PCM * substream. */ snd_pcm_period_elapsed(dpcm->substream); if (!atomic_read(&dpcm->running)) return HRTIMER_NORESTART; hrtimer_forward_now(timer, dpcm->period_time); return HRTIMER_RESTART; } static int dummy_hrtimer_start(struct snd_pcm_substream *substream) { struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data; dpcm->base_time = hrtimer_cb_get_time(&dpcm->timer); hrtimer_start(&dpcm->timer, dpcm->period_time, HRTIMER_MODE_REL_SOFT); atomic_set(&dpcm->running, 1); return 0; } static int dummy_hrtimer_stop(struct snd_pcm_substream *substream) { struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data; atomic_set(&dpcm->running, 0); if (!hrtimer_callback_running(&dpcm->timer)) hrtimer_cancel(&dpcm->timer); return 0; } static inline void dummy_hrtimer_sync(struct dummy_hrtimer_pcm *dpcm) { hrtimer_cancel(&dpcm->timer); } static snd_pcm_uframes_t dummy_hrtimer_pointer(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct dummy_hrtimer_pcm *dpcm = runtime->private_data; u64 delta; u32 pos; delta = ktime_us_delta(hrtimer_cb_get_time(&dpcm->timer), dpcm->base_time); delta = div_u64(delta * runtime->rate + 999999, 1000000); div_u64_rem(delta, runtime->buffer_size, &pos); return pos; } static int dummy_hrtimer_prepare(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct dummy_hrtimer_pcm *dpcm = runtime->private_data; unsigned int period, rate; long sec; unsigned long nsecs; dummy_hrtimer_sync(dpcm); period = runtime->period_size; rate = runtime->rate; sec = period / rate; period %= rate; nsecs = div_u64((u64)period * 1000000000UL + rate - 1, rate); dpcm->period_time = ktime_set(sec, nsecs); return 0; } static int dummy_hrtimer_create(struct snd_pcm_substream *substream) { struct dummy_hrtimer_pcm *dpcm; dpcm = kzalloc(sizeof(*dpcm), GFP_KERNEL); if (!dpcm) return -ENOMEM; substream->runtime->private_data = dpcm; hrtimer_init(&dpcm->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); dpcm->timer.function = dummy_hrtimer_callback; dpcm->substream = substream; atomic_set(&dpcm->running, 0); return 0; } static void dummy_hrtimer_free(struct snd_pcm_substream *substream) { struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data; dummy_hrtimer_sync(dpcm); kfree(dpcm); } static const struct dummy_timer_ops dummy_hrtimer_ops = { .create = dummy_hrtimer_create, .free = dummy_hrtimer_free, .prepare = dummy_hrtimer_prepare, .start = dummy_hrtimer_start, .stop = dummy_hrtimer_stop, .pointer = dummy_hrtimer_pointer, }; #endif /* CONFIG_HIGH_RES_TIMERS */ /* * PCM interface */ static int dummy_pcm_trigger(struct snd_pcm_substream *substream, int cmd) { switch (cmd) { case SNDRV_PCM_TRIGGER_START: case SNDRV_PCM_TRIGGER_RESUME: return get_dummy_ops(substream)->start(substream); case SNDRV_PCM_TRIGGER_STOP: case SNDRV_PCM_TRIGGER_SUSPEND: return get_dummy_ops(substream)->stop(substream); } return -EINVAL; } static int dummy_pcm_prepare(struct snd_pcm_substream *substream) { return get_dummy_ops(substream)->prepare(substream); } static snd_pcm_uframes_t dummy_pcm_pointer(struct snd_pcm_substream *substream) { return get_dummy_ops(substream)->pointer(substream); } static const struct snd_pcm_hardware dummy_pcm_hardware = { .info = (SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_INTERLEAVED | SNDRV_PCM_INFO_RESUME | SNDRV_PCM_INFO_MMAP_VALID), .formats = USE_FORMATS, .rates = USE_RATE, .rate_min = USE_RATE_MIN, .rate_max = USE_RATE_MAX, .channels_min = USE_CHANNELS_MIN, .channels_max = USE_CHANNELS_MAX, .buffer_bytes_max = MAX_BUFFER_SIZE, .period_bytes_min = MIN_PERIOD_SIZE, .period_bytes_max = MAX_PERIOD_SIZE, .periods_min = USE_PERIODS_MIN, .periods_max = USE_PERIODS_MAX, .fifo_size = 0, }; static int dummy_pcm_hw_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *hw_params) { if (fake_buffer) { /* runtime->dma_bytes has to be set manually to allow mmap */ substream->runtime->dma_bytes = params_buffer_bytes(hw_params); return 0; } return snd_pcm_lib_malloc_pages(substream, params_buffer_bytes(hw_params)); } static int dummy_pcm_hw_free(struct snd_pcm_substream *substream) { if (fake_buffer) return 0; return snd_pcm_lib_free_pages(substream); } static int dummy_pcm_open(struct snd_pcm_substream *substream) { struct snd_dummy *dummy = snd_pcm_substream_chip(substream); struct dummy_model *model = dummy->model; struct snd_pcm_runtime *runtime = substream->runtime; const struct dummy_timer_ops *ops; int err; ops = &dummy_systimer_ops; #ifdef CONFIG_HIGH_RES_TIMERS if (hrtimer) ops = &dummy_hrtimer_ops; #endif err = ops->create(substream); if (err < 0) return err; get_dummy_ops(substream) = ops; runtime->hw = dummy->pcm_hw; if (substream->pcm->device & 1) { runtime->hw.info &= ~SNDRV_PCM_INFO_INTERLEAVED; runtime->hw.info |= SNDRV_PCM_INFO_NONINTERLEAVED; } if (substream->pcm->device & 2) runtime->hw.info &= ~(SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_MMAP_VALID); if (model == NULL) return 0; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { if (model->playback_constraints) err = model->playback_constraints(substream->runtime); } else { if (model->capture_constraints) err = model->capture_constraints(substream->runtime); } if (err < 0) { get_dummy_ops(substream)->free(substream); return err; } return 0; } static int dummy_pcm_close(struct snd_pcm_substream *substream) { get_dummy_ops(substream)->free(substream); return 0; } /* * dummy buffer handling */ static void *dummy_page[2]; static void free_fake_buffer(void) { if (fake_buffer) { int i; for (i = 0; i < 2; i++) if (dummy_page[i]) { free_page((unsigned long)dummy_page[i]); dummy_page[i] = NULL; } } } static int alloc_fake_buffer(void) { int i; if (!fake_buffer) return 0; for (i = 0; i < 2; i++) { dummy_page[i] = (void *)get_zeroed_page(GFP_KERNEL); if (!dummy_page[i]) { free_fake_buffer(); return -ENOMEM; } } return 0; } static int dummy_pcm_copy(struct snd_pcm_substream *substream, int channel, unsigned long pos, void __user *dst, unsigned long bytes) { return 0; /* do nothing */ } static int dummy_pcm_copy_kernel(struct snd_pcm_substream *substream, int channel, unsigned long pos, void *dst, unsigned long bytes) { return 0; /* do nothing */ } static int dummy_pcm_silence(struct snd_pcm_substream *substream, int channel, unsigned long pos, unsigned long bytes) { return 0; /* do nothing */ } static struct page *dummy_pcm_page(struct snd_pcm_substream *substream, unsigned long offset) { return virt_to_page(dummy_page[substream->stream]); /* the same page */ } static struct snd_pcm_ops dummy_pcm_ops = { .open = dummy_pcm_open, .close = dummy_pcm_close, .ioctl = snd_pcm_lib_ioctl, .hw_params = dummy_pcm_hw_params, .hw_free = dummy_pcm_hw_free, .prepare = dummy_pcm_prepare, .trigger = dummy_pcm_trigger, .pointer = dummy_pcm_pointer, }; static struct snd_pcm_ops dummy_pcm_ops_no_buf = { .open = dummy_pcm_open, .close = dummy_pcm_close, .ioctl = snd_pcm_lib_ioctl, .hw_params = dummy_pcm_hw_params, .hw_free = dummy_pcm_hw_free, .prepare = dummy_pcm_prepare, .trigger = dummy_pcm_trigger, .pointer = dummy_pcm_pointer, .copy_user = dummy_pcm_copy, .copy_kernel = dummy_pcm_copy_kernel, .fill_silence = dummy_pcm_silence, .page = dummy_pcm_page, }; static int snd_card_dummy_pcm(struct snd_dummy *dummy, int device, int substreams) { struct snd_pcm *pcm; struct snd_pcm_ops *ops; int err; err = snd_pcm_new(dummy->card, "Dummy PCM", device, substreams, substreams, &pcm); if (err < 0) return err; dummy->pcm = pcm; if (fake_buffer) ops = &dummy_pcm_ops_no_buf; else ops = &dummy_pcm_ops; snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK, ops); snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_CAPTURE, ops); pcm->private_data = dummy; pcm->info_flags = 0; strcpy(pcm->name, "Dummy PCM"); if (!fake_buffer) { snd_pcm_lib_preallocate_pages_for_all(pcm, SNDRV_DMA_TYPE_CONTINUOUS, snd_dma_continuous_data(GFP_KERNEL), 0, 64*1024); } return 0; } /* * mixer interface */ #define DUMMY_VOLUME(xname, xindex, addr) \ { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, \ .access = SNDRV_CTL_ELEM_ACCESS_READWRITE | SNDRV_CTL_ELEM_ACCESS_TLV_READ, \ .name = xname, .index = xindex, \ .info = snd_dummy_volume_info, \ .get = snd_dummy_volume_get, .put = snd_dummy_volume_put, \ .private_value = addr, \ .tlv = { .p = db_scale_dummy } } static int snd_dummy_volume_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; uinfo->count = 2; uinfo->value.integer.min = -50; uinfo->value.integer.max = 100; return 0; } static int snd_dummy_volume_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_dummy *dummy = snd_kcontrol_chip(kcontrol); int addr = kcontrol->private_value; spin_lock_irq(&dummy->mixer_lock); ucontrol->value.integer.value[0] = dummy->mixer_volume[addr][0]; ucontrol->value.integer.value[1] = dummy->mixer_volume[addr][1]; spin_unlock_irq(&dummy->mixer_lock); return 0; } static int snd_dummy_volume_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_dummy *dummy = snd_kcontrol_chip(kcontrol); int change, addr = kcontrol->private_value; int left, right; left = ucontrol->value.integer.value[0]; if (left < -50) left = -50; if (left > 100) left = 100; right = ucontrol->value.integer.value[1]; if (right < -50) right = -50; if (right > 100) right = 100; spin_lock_irq(&dummy->mixer_lock); change = dummy->mixer_volume[addr][0] != left || dummy->mixer_volume[addr][1] != right; dummy->mixer_volume[addr][0] = left; dummy->mixer_volume[addr][1] = right; spin_unlock_irq(&dummy->mixer_lock); return change; } static const DECLARE_TLV_DB_SCALE(db_scale_dummy, -4500, 30, 0); #define DUMMY_CAPSRC(xname, xindex, addr) \ { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = xname, .index = xindex, \ .info = snd_dummy_capsrc_info, \ .get = snd_dummy_capsrc_get, .put = snd_dummy_capsrc_put, \ .private_value = addr } #define snd_dummy_capsrc_info snd_ctl_boolean_stereo_info static int snd_dummy_capsrc_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_dummy *dummy = snd_kcontrol_chip(kcontrol); int addr = kcontrol->private_value; spin_lock_irq(&dummy->mixer_lock); ucontrol->value.integer.value[0] = dummy->capture_source[addr][0]; ucontrol->value.integer.value[1] = dummy->capture_source[addr][1]; spin_unlock_irq(&dummy->mixer_lock); return 0; } static int snd_dummy_capsrc_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_dummy *dummy = snd_kcontrol_chip(kcontrol); int change, addr = kcontrol->private_value; int left, right; left = ucontrol->value.integer.value[0] & 1; right = ucontrol->value.integer.value[1] & 1; spin_lock_irq(&dummy->mixer_lock); change = dummy->capture_source[addr][0] != left && dummy->capture_source[addr][1] != right; dummy->capture_source[addr][0] = left; dummy->capture_source[addr][1] = right; spin_unlock_irq(&dummy->mixer_lock); return change; } static int snd_dummy_iobox_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *info) { static const char *const names[] = { "None", "CD Player" }; return snd_ctl_enum_info(info, 1, 2, names); } static int snd_dummy_iobox_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *value) { struct snd_dummy *dummy = snd_kcontrol_chip(kcontrol); value->value.enumerated.item[0] = dummy->iobox; return 0; } static int snd_dummy_iobox_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *value) { struct snd_dummy *dummy = snd_kcontrol_chip(kcontrol); int changed; if (value->value.enumerated.item[0] > 1) return -EINVAL; changed = value->value.enumerated.item[0] != dummy->iobox; if (changed) { dummy->iobox = value->value.enumerated.item[0]; if (dummy->iobox) { dummy->cd_volume_ctl->vd[0].access &= ~SNDRV_CTL_ELEM_ACCESS_INACTIVE; dummy->cd_switch_ctl->vd[0].access &= ~SNDRV_CTL_ELEM_ACCESS_INACTIVE; } else { dummy->cd_volume_ctl->vd[0].access |= SNDRV_CTL_ELEM_ACCESS_INACTIVE; dummy->cd_switch_ctl->vd[0].access |= SNDRV_CTL_ELEM_ACCESS_INACTIVE; } snd_ctl_notify(dummy->card, SNDRV_CTL_EVENT_MASK_INFO, &dummy->cd_volume_ctl->id); snd_ctl_notify(dummy->card, SNDRV_CTL_EVENT_MASK_INFO, &dummy->cd_switch_ctl->id); } return changed; } static struct snd_kcontrol_new snd_dummy_controls[] = { DUMMY_VOLUME("Master Volume", 0, MIXER_ADDR_MASTER), DUMMY_CAPSRC("Master Capture Switch", 0, MIXER_ADDR_MASTER), DUMMY_VOLUME("Synth Volume", 0, MIXER_ADDR_SYNTH), DUMMY_CAPSRC("Synth Capture Switch", 0, MIXER_ADDR_SYNTH), DUMMY_VOLUME("Line Volume", 0, MIXER_ADDR_LINE), DUMMY_CAPSRC("Line Capture Switch", 0, MIXER_ADDR_LINE), DUMMY_VOLUME("Mic Volume", 0, MIXER_ADDR_MIC), DUMMY_CAPSRC("Mic Capture Switch", 0, MIXER_ADDR_MIC), DUMMY_VOLUME("CD Volume", 0, MIXER_ADDR_CD), DUMMY_CAPSRC("CD Capture Switch", 0, MIXER_ADDR_CD), { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "External I/O Box", .info = snd_dummy_iobox_info, .get = snd_dummy_iobox_get, .put = snd_dummy_iobox_put, }, }; static int snd_card_dummy_new_mixer(struct snd_dummy *dummy) { struct snd_card *card = dummy->card; struct snd_kcontrol *kcontrol; unsigned int idx; int err; spin_lock_init(&dummy->mixer_lock); strcpy(card->mixername, "Dummy Mixer"); dummy->iobox = 1; for (idx = 0; idx < ARRAY_SIZE(snd_dummy_controls); idx++) { kcontrol = snd_ctl_new1(&snd_dummy_controls[idx], dummy); err = snd_ctl_add(card, kcontrol); if (err < 0) return err; if (!strcmp(kcontrol->id.name, "CD Volume")) dummy->cd_volume_ctl = kcontrol; else if (!strcmp(kcontrol->id.name, "CD Capture Switch")) dummy->cd_switch_ctl = kcontrol; } return 0; } #if defined(CONFIG_SND_DEBUG) && defined(CONFIG_SND_PROC_FS) /* * proc interface */ static void print_formats(struct snd_dummy *dummy, struct snd_info_buffer *buffer) { int i; for (i = 0; i <= SNDRV_PCM_FORMAT_LAST; i++) { if (dummy->pcm_hw.formats & (1ULL << i)) snd_iprintf(buffer, " %s", snd_pcm_format_name(i)); } } static void print_rates(struct snd_dummy *dummy, struct snd_info_buffer *buffer) { static int rates[] = { 5512, 8000, 11025, 16000, 22050, 32000, 44100, 48000, 64000, 88200, 96000, 176400, 192000, }; int i; if (dummy->pcm_hw.rates & SNDRV_PCM_RATE_CONTINUOUS) snd_iprintf(buffer, " continuous"); if (dummy->pcm_hw.rates & SNDRV_PCM_RATE_KNOT) snd_iprintf(buffer, " knot"); for (i = 0; i < ARRAY_SIZE(rates); i++) if (dummy->pcm_hw.rates & (1 << i)) snd_iprintf(buffer, " %d", rates[i]); } #define get_dummy_int_ptr(dummy, ofs) \ (unsigned int *)((char *)&((dummy)->pcm_hw) + (ofs)) #define get_dummy_ll_ptr(dummy, ofs) \ (unsigned long long *)((char *)&((dummy)->pcm_hw) + (ofs)) struct dummy_hw_field { const char *name; const char *format; unsigned int offset; unsigned int size; }; #define FIELD_ENTRY(item, fmt) { \ .name = #item, \ .format = fmt, \ .offset = offsetof(struct snd_pcm_hardware, item), \ .size = sizeof(dummy_pcm_hardware.item) } static struct dummy_hw_field fields[] = { FIELD_ENTRY(formats, "%#llx"), FIELD_ENTRY(rates, "%#x"), FIELD_ENTRY(rate_min, "%d"), FIELD_ENTRY(rate_max, "%d"), FIELD_ENTRY(channels_min, "%d"), FIELD_ENTRY(channels_max, "%d"), FIELD_ENTRY(buffer_bytes_max, "%ld"), FIELD_ENTRY(period_bytes_min, "%ld"), FIELD_ENTRY(period_bytes_max, "%ld"), FIELD_ENTRY(periods_min, "%d"), FIELD_ENTRY(periods_max, "%d"), }; static void dummy_proc_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct snd_dummy *dummy = entry->private_data; int i; for (i = 0; i < ARRAY_SIZE(fields); i++) { snd_iprintf(buffer, "%s ", fields[i].name); if (fields[i].size == sizeof(int)) snd_iprintf(buffer, fields[i].format, *get_dummy_int_ptr(dummy, fields[i].offset)); else snd_iprintf(buffer, fields[i].format, *get_dummy_ll_ptr(dummy, fields[i].offset)); if (!strcmp(fields[i].name, "formats")) print_formats(dummy, buffer); else if (!strcmp(fields[i].name, "rates")) print_rates(dummy, buffer); snd_iprintf(buffer, "\n"); } } static void dummy_proc_write(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct snd_dummy *dummy = entry->private_data; char line[64]; while (!snd_info_get_line(buffer, line, sizeof(line))) { char item[20]; const char *ptr; unsigned long long val; int i; ptr = snd_info_get_str(item, line, sizeof(item)); for (i = 0; i < ARRAY_SIZE(fields); i++) { if (!strcmp(item, fields[i].name)) break; } if (i >= ARRAY_SIZE(fields)) continue; snd_info_get_str(item, ptr, sizeof(item)); if (kstrtoull(item, 0, &val)) continue; if (fields[i].size == sizeof(int)) *get_dummy_int_ptr(dummy, fields[i].offset) = val; else *get_dummy_ll_ptr(dummy, fields[i].offset) = val; } } static void dummy_proc_init(struct snd_dummy *chip) { struct snd_info_entry *entry; if (!snd_card_proc_new(chip->card, "dummy_pcm", &entry)) { snd_info_set_text_ops(entry, chip, dummy_proc_read); entry->c.text.write = dummy_proc_write; entry->mode |= 0200; entry->private_data = chip; } } #else #define dummy_proc_init(x) #endif /* CONFIG_SND_DEBUG && CONFIG_SND_PROC_FS */ static int snd_dummy_probe(struct platform_device *devptr) { struct snd_card *card; struct snd_dummy *dummy; struct dummy_model *m = NULL, **mdl; int idx, err; int dev = devptr->id; err = snd_card_new(&devptr->dev, index[dev], id[dev], THIS_MODULE, sizeof(struct snd_dummy), &card); if (err < 0) return err; dummy = card->private_data; dummy->card = card; for (mdl = dummy_models; *mdl && model[dev]; mdl++) { if (strcmp(model[dev], (*mdl)->name) == 0) { printk(KERN_INFO "snd-dummy: Using model '%s' for card %i\n", (*mdl)->name, card->number); m = dummy->model = *mdl; break; } } for (idx = 0; idx < MAX_PCM_DEVICES && idx < pcm_devs[dev]; idx++) { if (pcm_substreams[dev] < 1) pcm_substreams[dev] = 1; if (pcm_substreams[dev] > MAX_PCM_SUBSTREAMS) pcm_substreams[dev] = MAX_PCM_SUBSTREAMS; err = snd_card_dummy_pcm(dummy, idx, pcm_substreams[dev]); if (err < 0) goto __nodev; } dummy->pcm_hw = dummy_pcm_hardware; if (m) { if (m->formats) dummy->pcm_hw.formats = m->formats; if (m->buffer_bytes_max) dummy->pcm_hw.buffer_bytes_max = m->buffer_bytes_max; if (m->period_bytes_min) dummy->pcm_hw.period_bytes_min = m->period_bytes_min; if (m->period_bytes_max) dummy->pcm_hw.period_bytes_max = m->period_bytes_max; if (m->periods_min) dummy->pcm_hw.periods_min = m->periods_min; if (m->periods_max) dummy->pcm_hw.periods_max = m->periods_max; if (m->rates) dummy->pcm_hw.rates = m->rates; if (m->rate_min) dummy->pcm_hw.rate_min = m->rate_min; if (m->rate_max) dummy->pcm_hw.rate_max = m->rate_max; if (m->channels_min) dummy->pcm_hw.channels_min = m->channels_min; if (m->channels_max) dummy->pcm_hw.channels_max = m->channels_max; } err = snd_card_dummy_new_mixer(dummy); if (err < 0) goto __nodev; strcpy(card->driver, "Dummy"); strcpy(card->shortname, "Dummy"); sprintf(card->longname, "Dummy %i", dev + 1); dummy_proc_init(dummy); err = snd_card_register(card); if (err == 0) { platform_set_drvdata(devptr, card); return 0; } __nodev: snd_card_free(card); return err; } static int snd_dummy_remove(struct platform_device *devptr) { snd_card_free(platform_get_drvdata(devptr)); return 0; } #ifdef CONFIG_PM_SLEEP static int snd_dummy_suspend(struct device *pdev) { struct snd_card *card = dev_get_drvdata(pdev); struct snd_dummy *dummy = card->private_data; snd_power_change_state(card, SNDRV_CTL_POWER_D3hot); snd_pcm_suspend_all(dummy->pcm); return 0; } static int snd_dummy_resume(struct device *pdev) { struct snd_card *card = dev_get_drvdata(pdev); snd_power_change_state(card, SNDRV_CTL_POWER_D0); return 0; } static SIMPLE_DEV_PM_OPS(snd_dummy_pm, snd_dummy_suspend, snd_dummy_resume); #define SND_DUMMY_PM_OPS &snd_dummy_pm #else #define SND_DUMMY_PM_OPS NULL #endif #define SND_DUMMY_DRIVER "snd_dummy" static struct platform_driver snd_dummy_driver = { .probe = snd_dummy_probe, .remove = snd_dummy_remove, .driver = { .name = SND_DUMMY_DRIVER, .pm = SND_DUMMY_PM_OPS, }, }; static void snd_dummy_unregister_all(void) { int i; for (i = 0; i < ARRAY_SIZE(devices); ++i) platform_device_unregister(devices[i]); platform_driver_unregister(&snd_dummy_driver); free_fake_buffer(); } static int __init alsa_card_dummy_init(void) { int i, cards, err; err = platform_driver_register(&snd_dummy_driver); if (err < 0) return err; err = alloc_fake_buffer(); if (err < 0) { platform_driver_unregister(&snd_dummy_driver); return err; } cards = 0; for (i = 0; i < SNDRV_CARDS; i++) { struct platform_device *device; if (! enable[i]) continue; device = platform_device_register_simple(SND_DUMMY_DRIVER, i, NULL, 0); if (IS_ERR(device)) continue; if (!platform_get_drvdata(device)) { platform_device_unregister(device); continue; } devices[i] = device; cards++; } if (!cards) { #ifdef MODULE printk(KERN_ERR "Dummy soundcard not found or device busy\n"); #endif snd_dummy_unregister_all(); return -ENODEV; } return 0; } static void __exit alsa_card_dummy_exit(void) { snd_dummy_unregister_all(); } module_init(alsa_card_dummy_init) module_exit(alsa_card_dummy_exit) |
92 92 1 92 92 92 1 91 87 2 85 1 10 10 74 80 1 79 91 1 90 88 88 11 11 92 24 24 24 16 16 24 6 5 6 4 111 96 4 111 2 28 28 27 27 27 27 27 27 16 16 16 16 16 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/btree.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Handle opening/closing btree */ #include <linux/slab.h> #include <linux/pagemap.h> #include <linux/log2.h> #include "hfsplus_fs.h" #include "hfsplus_raw.h" /* * Initial source code of clump size calculation is gotten * from http://opensource.apple.com/tarballs/diskdev_cmds/ */ #define CLUMP_ENTRIES 15 static short clumptbl[CLUMP_ENTRIES * 3] = { /* * Volume Attributes Catalog Extents * Size Clump (MB) Clump (MB) Clump (MB) */ /* 1GB */ 4, 4, 4, /* 2GB */ 6, 6, 4, /* 4GB */ 8, 8, 4, /* 8GB */ 11, 11, 5, /* * For volumes 16GB and larger, we want to make sure that a full OS * install won't require fragmentation of the Catalog or Attributes * B-trees. We do this by making the clump sizes sufficiently large, * and by leaving a gap after the B-trees for them to grow into. * * For SnowLeopard 10A298, a FullNetInstall with all packages selected * results in: * Catalog B-tree Header * nodeSize: 8192 * totalNodes: 31616 * freeNodes: 1978 * (used = 231.55 MB) * Attributes B-tree Header * nodeSize: 8192 * totalNodes: 63232 * freeNodes: 958 * (used = 486.52 MB) * * We also want Time Machine backup volumes to have a sufficiently * large clump size to reduce fragmentation. * * The series of numbers for Catalog and Attribute form a geometric * series. For Catalog (16GB to 512GB), each term is 8**(1/5) times * the previous term. For Attributes (16GB to 512GB), each term is * 4**(1/5) times the previous term. For 1TB to 16TB, each term is * 2**(1/5) times the previous term. */ /* 16GB */ 64, 32, 5, /* 32GB */ 84, 49, 6, /* 64GB */ 111, 74, 7, /* 128GB */ 147, 111, 8, /* 256GB */ 194, 169, 9, /* 512GB */ 256, 256, 11, /* 1TB */ 294, 294, 14, /* 2TB */ 338, 338, 16, /* 4TB */ 388, 388, 20, /* 8TB */ 446, 446, 25, /* 16TB */ 512, 512, 32 }; u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size, u64 sectors, int file_id) { u32 mod = max(node_size, block_size); u32 clump_size; int column; int i; /* Figure out which column of the above table to use for this file. */ switch (file_id) { case HFSPLUS_ATTR_CNID: column = 0; break; case HFSPLUS_CAT_CNID: column = 1; break; default: column = 2; break; } /* * The default clump size is 0.8% of the volume size. And * it must also be a multiple of the node and block size. */ if (sectors < 0x200000) { clump_size = sectors << 2; /* 0.8 % */ if (clump_size < (8 * node_size)) clump_size = 8 * node_size; } else { /* turn exponent into table index... */ for (i = 0, sectors = sectors >> 22; sectors && (i < CLUMP_ENTRIES - 1); ++i, sectors = sectors >> 1) { /* empty body */ } clump_size = clumptbl[column + (i) * 3] * 1024 * 1024; } /* * Round the clump size to a multiple of node and block size. * NOTE: This rounds down. */ clump_size /= mod; clump_size *= mod; /* * Rounding down could have rounded down to 0 if the block size was * greater than the clump size. If so, just use one block or node. */ if (clump_size == 0) clump_size = mod; return clump_size; } /* Get a reference to a B*Tree and do some initial checks */ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) { struct hfs_btree *tree; struct hfs_btree_header_rec *head; struct address_space *mapping; struct inode *inode; struct page *page; unsigned int size; tree = kzalloc(sizeof(*tree), GFP_KERNEL); if (!tree) return NULL; mutex_init(&tree->tree_lock); spin_lock_init(&tree->hash_lock); tree->sb = sb; tree->cnid = id; inode = hfsplus_iget(sb, id); if (IS_ERR(inode)) goto free_tree; tree->inode = inode; if (!HFSPLUS_I(tree->inode)->first_blocks) { pr_err("invalid btree extent records (0 size)\n"); goto free_inode; } mapping = tree->inode->i_mapping; page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) goto free_inode; /* Load the header */ head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); tree->root = be32_to_cpu(head->root); tree->leaf_count = be32_to_cpu(head->leaf_count); tree->leaf_head = be32_to_cpu(head->leaf_head); tree->leaf_tail = be32_to_cpu(head->leaf_tail); tree->node_count = be32_to_cpu(head->node_count); tree->free_nodes = be32_to_cpu(head->free_nodes); tree->attributes = be32_to_cpu(head->attributes); tree->node_size = be16_to_cpu(head->node_size); tree->max_key_len = be16_to_cpu(head->max_key_len); tree->depth = be16_to_cpu(head->depth); /* Verify the tree and set the correct compare function */ switch (id) { case HFSPLUS_EXT_CNID: if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) { pr_err("invalid extent max_key_len %d\n", tree->max_key_len); goto fail_page; } if (tree->attributes & HFS_TREE_VARIDXKEYS) { pr_err("invalid extent btree flag\n"); goto fail_page; } tree->keycmp = hfsplus_ext_cmp_key; break; case HFSPLUS_CAT_CNID: if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) { pr_err("invalid catalog max_key_len %d\n", tree->max_key_len); goto fail_page; } if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) { pr_err("invalid catalog btree flag\n"); goto fail_page; } if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) && (head->key_type == HFSPLUS_KEY_BINARY)) tree->keycmp = hfsplus_cat_bin_cmp_key; else { tree->keycmp = hfsplus_cat_case_cmp_key; set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); } break; case HFSPLUS_ATTR_CNID: if (tree->max_key_len != HFSPLUS_ATTR_KEYLEN - sizeof(u16)) { pr_err("invalid attributes max_key_len %d\n", tree->max_key_len); goto fail_page; } tree->keycmp = hfsplus_attr_bin_cmp_key; break; default: pr_err("unknown B*Tree requested\n"); goto fail_page; } if (!(tree->attributes & HFS_TREE_BIGKEYS)) { pr_err("invalid btree flag\n"); goto fail_page; } size = tree->node_size; if (!is_power_of_2(size)) goto fail_page; if (!tree->node_count) goto fail_page; tree->node_size_shift = ffs(size) - 1; tree->pages_per_bnode = (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT; kunmap(page); put_page(page); return tree; fail_page: put_page(page); free_inode: tree->inode->i_mapping->a_ops = &hfsplus_aops; iput(tree->inode); free_tree: kfree(tree); return NULL; } /* Release resources used by a btree */ void hfs_btree_close(struct hfs_btree *tree) { struct hfs_bnode *node; int i; if (!tree) return; for (i = 0; i < NODE_HASH_SIZE; i++) { while ((node = tree->node_hash[i])) { tree->node_hash[i] = node->next_hash; if (atomic_read(&node->refcnt)) pr_crit("node %d:%d " "still has %d user(s)!\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); hfs_bnode_free(node); tree->node_hash_cnt--; } } iput(tree->inode); kfree(tree); } int hfs_btree_write(struct hfs_btree *tree) { struct hfs_btree_header_rec *head; struct hfs_bnode *node; struct page *page; node = hfs_bnode_find(tree, 0); if (IS_ERR(node)) /* panic? */ return -EIO; /* Load the header */ page = node->page[0]; head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); head->root = cpu_to_be32(tree->root); head->leaf_count = cpu_to_be32(tree->leaf_count); head->leaf_head = cpu_to_be32(tree->leaf_head); head->leaf_tail = cpu_to_be32(tree->leaf_tail); head->node_count = cpu_to_be32(tree->node_count); head->free_nodes = cpu_to_be32(tree->free_nodes); head->attributes = cpu_to_be32(tree->attributes); head->depth = cpu_to_be16(tree->depth); kunmap(page); set_page_dirty(page); hfs_bnode_put(node); return 0; } static struct hfs_bnode *hfs_bmap_new_bmap(struct hfs_bnode *prev, u32 idx) { struct hfs_btree *tree = prev->tree; struct hfs_bnode *node; struct hfs_bnode_desc desc; __be32 cnid; node = hfs_bnode_create(tree, idx); if (IS_ERR(node)) return node; tree->free_nodes--; prev->next = idx; cnid = cpu_to_be32(idx); hfs_bnode_write(prev, &cnid, offsetof(struct hfs_bnode_desc, next), 4); node->type = HFS_NODE_MAP; node->num_recs = 1; hfs_bnode_clear(node, 0, tree->node_size); desc.next = 0; desc.prev = 0; desc.type = HFS_NODE_MAP; desc.height = 0; desc.num_recs = cpu_to_be16(1); desc.reserved = 0; hfs_bnode_write(node, &desc, 0, sizeof(desc)); hfs_bnode_write_u16(node, 14, 0x8000); hfs_bnode_write_u16(node, tree->node_size - 2, 14); hfs_bnode_write_u16(node, tree->node_size - 4, tree->node_size - 6); return node; } /* Make sure @tree has enough space for the @rsvd_nodes */ int hfs_bmap_reserve(struct hfs_btree *tree, int rsvd_nodes) { struct inode *inode = tree->inode; struct hfsplus_inode_info *hip = HFSPLUS_I(inode); u32 count; int res; if (rsvd_nodes <= 0) return 0; while (tree->free_nodes < rsvd_nodes) { res = hfsplus_file_extend(inode, hfs_bnode_need_zeroout(tree)); if (res) return res; hip->phys_size = inode->i_size = (loff_t)hip->alloc_blocks << HFSPLUS_SB(tree->sb)->alloc_blksz_shift; hip->fs_blocks = hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift; inode_set_bytes(inode, inode->i_size); count = inode->i_size >> tree->node_size_shift; tree->free_nodes += count - tree->node_count; tree->node_count = count; } return 0; } struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) { struct hfs_bnode *node, *next_node; struct page **pagep; u32 nidx, idx; unsigned off; u16 off16; u16 len; u8 *data, byte, m; int i, res; res = hfs_bmap_reserve(tree, 1); if (res) return ERR_PTR(res); nidx = 0; node = hfs_bnode_find(tree, nidx); if (IS_ERR(node)) return node; len = hfs_brec_lenoff(node, 2, &off16); off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); data = kmap(*pagep); off &= ~PAGE_MASK; idx = 0; for (;;) { while (len) { byte = data[off]; if (byte != 0xff) { for (m = 0x80, i = 0; i < 8; m >>= 1, i++) { if (!(byte & m)) { idx += i; data[off] |= m; set_page_dirty(*pagep); kunmap(*pagep); tree->free_nodes--; mark_inode_dirty(tree->inode); hfs_bnode_put(node); return hfs_bnode_create(tree, idx); } } } if (++off >= PAGE_SIZE) { kunmap(*pagep); data = kmap(*++pagep); off = 0; } idx += 8; len--; } kunmap(*pagep); nidx = node->next; if (!nidx) { hfs_dbg(BNODE_MOD, "create new bmap node\n"); next_node = hfs_bmap_new_bmap(node, idx); } else next_node = hfs_bnode_find(tree, nidx); hfs_bnode_put(node); if (IS_ERR(next_node)) return next_node; node = next_node; len = hfs_brec_lenoff(node, 0, &off16); off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_SHIFT); data = kmap(*pagep); off &= ~PAGE_MASK; } } void hfs_bmap_free(struct hfs_bnode *node) { struct hfs_btree *tree; struct page *page; u16 off, len; u32 nidx; u8 *data, byte, m; hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this); BUG_ON(!node->this); tree = node->tree; nidx = node->this; node = hfs_bnode_find(tree, 0); if (IS_ERR(node)) return; len = hfs_brec_lenoff(node, 2, &off); while (nidx >= len * 8) { u32 i; nidx -= len * 8; i = node->next; if (!i) { /* panic */; pr_crit("unable to free bnode %u. " "bmap not found!\n", node->this); hfs_bnode_put(node); return; } hfs_bnode_put(node); node = hfs_bnode_find(tree, i); if (IS_ERR(node)) return; if (node->type != HFS_NODE_MAP) { /* panic */; pr_crit("invalid bmap found! " "(%u,%d)\n", node->this, node->type); hfs_bnode_put(node); return; } len = hfs_brec_lenoff(node, 0, &off); } off += node->page_offset + nidx / 8; page = node->page[off >> PAGE_SHIFT]; data = kmap(page); off &= ~PAGE_MASK; m = 1 << (~nidx & 7); byte = data[off]; if (!(byte & m)) { pr_crit("trying to free free bnode " "%u(%d)\n", node->this, node->type); kunmap(page); hfs_bnode_put(node); return; } data[off] = byte & ~m; set_page_dirty(page); kunmap(page); hfs_bnode_put(node); tree->free_nodes++; mark_inode_dirty(tree->inode); } |
68 19 1 17 1 16 16 16 2 2 14 4 4 13 13 13 13 10 10 19 10 8 8 2 1 2444 2444 629 629 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 | /* * net/sched/act_mirred.c packet mirroring and redirect actions * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * Authors: Jamal Hadi Salim (2002-4) * * TODO: Add ingress support (and socket redirect support) * */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> #include <linux/gfp.h> #include <linux/if_arp.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <linux/tc_act/tc_mirred.h> #include <net/tc_act/tc_mirred.h> static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); static bool tcf_mirred_is_act_redirect(int action) { return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR; } static bool tcf_mirred_act_wants_ingress(int action) { switch (action) { case TCA_EGRESS_REDIR: case TCA_EGRESS_MIRROR: return false; case TCA_INGRESS_REDIR: case TCA_INGRESS_MIRROR: return true; default: BUG(); } } static bool tcf_mirred_can_reinsert(int action) { switch (action) { case TC_ACT_SHOT: case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: return true; } return false; } static struct net_device *tcf_mirred_dev_dereference(struct tcf_mirred *m) { return rcu_dereference_protected(m->tcfm_dev, lockdep_is_held(&m->tcf_lock)); } static void tcf_mirred_release(struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); struct net_device *dev; spin_lock(&mirred_list_lock); list_del(&m->tcfm_list); spin_unlock(&mirred_list_lock); /* last reference to action, no need to lock */ dev = rcu_dereference_protected(m->tcfm_dev, 1); if (dev) dev_put(dev); } static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) }, }; static unsigned int mirred_net_id; static struct tc_action_ops act_mirred_ops; static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); struct nlattr *tb[TCA_MIRRED_MAX + 1]; bool mac_header_xmit = false; struct tc_mirred *parm; struct tcf_mirred *m; struct net_device *dev; bool exists = false; int ret, err; u32 index; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed"); return -EINVAL; } ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy, extack); if (ret < 0) return ret; if (!tb[TCA_MIRRED_PARMS]) { NL_SET_ERR_MSG_MOD(extack, "Missing required mirred parameters"); return -EINVAL; } parm = nla_data(tb[TCA_MIRRED_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return 0; switch (parm->eaction) { case TCA_EGRESS_MIRROR: case TCA_EGRESS_REDIR: case TCA_INGRESS_REDIR: case TCA_INGRESS_MIRROR: break; default: if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option"); return -EINVAL; } if (!exists) { if (!parm->ifindex) { tcf_idr_cleanup(tn, index); NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist"); return -EINVAL; } ret = tcf_idr_create(tn, index, est, a, &act_mirred_ops, bind, true); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (!ovr) { tcf_idr_release(*a, bind); return -EEXIST; } m = to_mirred(*a); if (ret == ACT_P_CREATED) INIT_LIST_HEAD(&m->tcfm_list); spin_lock_bh(&m->tcf_lock); m->tcf_action = parm->action; m->tcfm_eaction = parm->eaction; if (parm->ifindex) { dev = dev_get_by_index(net, parm->ifindex); if (!dev) { spin_unlock_bh(&m->tcf_lock); tcf_idr_release(*a, bind); return -ENODEV; } mac_header_xmit = dev_is_mac_header_xmit(dev); rcu_swap_protected(m->tcfm_dev, dev, lockdep_is_held(&m->tcf_lock)); if (dev) dev_put(dev); m->tcfm_mac_header_xmit = mac_header_xmit; } spin_unlock_bh(&m->tcf_lock); if (ret == ACT_P_CREATED) { spin_lock(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); spin_unlock(&mirred_list_lock); tcf_idr_insert(tn, *a); } return ret; } static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_mirred *m = to_mirred(a); struct sk_buff *skb2 = skb; bool m_mac_header_xmit; struct net_device *dev; int retval, err = 0; bool use_reinsert; bool want_ingress; bool is_redirect; int m_eaction; int mac_len; tcf_lastuse_update(&m->tcf_tm); bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit); m_eaction = READ_ONCE(m->tcfm_eaction); retval = READ_ONCE(m->tcf_action); dev = rcu_dereference_bh(m->tcfm_dev); if (unlikely(!dev)) { pr_notice_once("tc mirred: target device is gone\n"); goto out; } if (unlikely(!(dev->flags & IFF_UP))) { net_notice_ratelimited("tc mirred to Houston: device %s is down\n", dev->name); goto out; } /* we could easily avoid the clone only if called by ingress and clsact; * since we can't easily detect the clsact caller, skip clone only for * ingress - that covers the TC S/W datapath. */ is_redirect = tcf_mirred_is_act_redirect(m_eaction); use_reinsert = skb_at_tc_ingress(skb) && is_redirect && tcf_mirred_can_reinsert(retval); if (!use_reinsert) { skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) goto out; } /* If action's target direction differs than filter's direction, * and devices expect a mac header on xmit, then mac push/pull is * needed. */ want_ingress = tcf_mirred_act_wants_ingress(m_eaction); if (skb_at_tc_ingress(skb) != want_ingress && m_mac_header_xmit) { if (!skb_at_tc_ingress(skb)) { /* caught at egress, act ingress: pull mac */ mac_len = skb_network_header(skb) - skb_mac_header(skb); skb_pull_rcsum(skb2, mac_len); } else { /* caught at ingress, act egress: push mac */ skb_push_rcsum(skb2, skb->mac_len); } } skb2->skb_iif = skb->dev->ifindex; skb2->dev = dev; /* mirror is always swallowed */ if (is_redirect) { skb2->tc_redirected = 1; skb2->tc_from_ingress = skb2->tc_at_ingress; /* let's the caller reinsert the packet, if possible */ if (use_reinsert) { res->ingress = want_ingress; res->qstats = this_cpu_ptr(m->common.cpu_qstats); return TC_ACT_REINSERT; } } if (!want_ingress) err = dev_queue_xmit(skb2); else err = netif_receive_skb(skb2); if (err) { out: qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats)); if (tcf_mirred_is_act_redirect(m_eaction)) retval = TC_ACT_SHOT; } return retval; } static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, u64 lastuse) { struct tcf_mirred *m = to_mirred(a); struct tcf_t *tm = &m->tcf_tm; _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_mirred *m = to_mirred(a); struct tc_mirred opt = { .index = m->tcf_index, .refcnt = refcount_read(&m->tcf_refcnt) - ref, .bindcnt = atomic_read(&m->tcf_bindcnt) - bind, }; struct net_device *dev; struct tcf_t t; spin_lock_bh(&m->tcf_lock); opt.action = m->tcf_action; opt.eaction = m->tcfm_eaction; dev = tcf_mirred_dev_dereference(m); if (dev) opt.ifindex = dev->ifindex; if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt)) goto nla_put_failure; tcf_tm_dump(&t, &m->tcf_tm); if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD)) goto nla_put_failure; spin_unlock_bh(&m->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&m->tcf_lock); nlmsg_trim(skb, b); return -1; } static int tcf_mirred_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); return tcf_idr_search(tn, a, index); } static int mirred_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tcf_mirred *m; ASSERT_RTNL(); if (event == NETDEV_UNREGISTER) { spin_lock(&mirred_list_lock); list_for_each_entry(m, &mirred_list, tcfm_list) { spin_lock_bh(&m->tcf_lock); if (tcf_mirred_dev_dereference(m) == dev) { dev_put(dev); /* Note : no rcu grace period necessary, as * net_device are already rcu protected. */ RCU_INIT_POINTER(m->tcfm_dev, NULL); } spin_unlock_bh(&m->tcf_lock); } spin_unlock(&mirred_list_lock); } return NOTIFY_DONE; } static struct notifier_block mirred_device_notifier = { .notifier_call = mirred_device_event, }; static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); struct net_device *dev; rcu_read_lock(); dev = rcu_dereference(m->tcfm_dev); if (dev) dev_hold(dev); rcu_read_unlock(); return dev; } static void tcf_mirred_put_dev(struct net_device *dev) { dev_put(dev); } static struct tc_action_ops act_mirred_ops = { .kind = "mirred", .type = TCA_ACT_MIRRED, .owner = THIS_MODULE, .act = tcf_mirred_act, .stats_update = tcf_stats_update, .dump = tcf_mirred_dump, .cleanup = tcf_mirred_release, .init = tcf_mirred_init, .walk = tcf_mirred_walker, .lookup = tcf_mirred_search, .size = sizeof(struct tcf_mirred), .get_dev = tcf_mirred_get_dev, .put_dev = tcf_mirred_put_dev, }; static __net_init int mirred_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, mirred_net_id); return tc_action_net_init(net, tn, &act_mirred_ops); } static void __net_exit mirred_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, mirred_net_id); } static struct pernet_operations mirred_net_ops = { .init = mirred_init_net, .exit_batch = mirred_exit_net, .id = &mirred_net_id, .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002)"); MODULE_DESCRIPTION("Device Mirror/redirect actions"); MODULE_LICENSE("GPL"); static int __init mirred_init_module(void) { int err = register_netdevice_notifier(&mirred_device_notifier); if (err) return err; pr_info("Mirror/redirect action on\n"); err = tcf_register_action(&act_mirred_ops, &mirred_net_ops); if (err) unregister_netdevice_notifier(&mirred_device_notifier); return err; } static void __exit mirred_cleanup_module(void) { tcf_unregister_action(&act_mirred_ops, &mirred_net_ops); unregister_netdevice_notifier(&mirred_device_notifier); } module_init(mirred_init_module); module_exit(mirred_cleanup_module); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 | /* * linux/mm/memory_hotplug.c * * Copyright (C) */ #include <linux/stddef.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/swap.h> #include <linux/interrupt.h> #include <linux/pagemap.h> #include <linux/compiler.h> #include <linux/export.h> #include <linux/pagevec.h> #include <linux/writeback.h> #include <linux/slab.h> #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/memory.h> #include <linux/memremap.h> #include <linux/memory_hotplug.h> #include <linux/highmem.h> #include <linux/vmalloc.h> #include <linux/ioport.h> #include <linux/delay.h> #include <linux/migrate.h> #include <linux/page-isolation.h> #include <linux/pfn.h> #include <linux/suspend.h> #include <linux/mm_inline.h> #include <linux/firmware-map.h> #include <linux/stop_machine.h> #include <linux/hugetlb.h> #include <linux/memblock.h> #include <linux/bootmem.h> #include <linux/compaction.h> #include <linux/rmap.h> #include <asm/tlbflush.h> #include "internal.h" /* * online_page_callback contains pointer to current page onlining function. * Initially it is generic_online_page(). If it is required it could be * changed by calling set_online_page_callback() for callback registration * and restore_online_page_callback() for generic callback restore. */ static void generic_online_page(struct page *page); static online_page_callback_t online_page_callback = generic_online_page; static DEFINE_MUTEX(online_page_callback_lock); DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock); void get_online_mems(void) { percpu_down_read(&mem_hotplug_lock); } void put_online_mems(void) { percpu_up_read(&mem_hotplug_lock); } bool movable_node_enabled = false; #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE bool memhp_auto_online; #else bool memhp_auto_online = true; #endif EXPORT_SYMBOL_GPL(memhp_auto_online); static int __init setup_memhp_default_state(char *str) { if (!strcmp(str, "online")) memhp_auto_online = true; else if (!strcmp(str, "offline")) memhp_auto_online = false; return 1; } __setup("memhp_default_state=", setup_memhp_default_state); void mem_hotplug_begin(void) { cpus_read_lock(); percpu_down_write(&mem_hotplug_lock); } void mem_hotplug_done(void) { percpu_up_write(&mem_hotplug_lock); cpus_read_unlock(); } /* add this memory to iomem resource */ static struct resource *register_memory_resource(u64 start, u64 size) { struct resource *res, *conflict; res = kzalloc(sizeof(struct resource), GFP_KERNEL); if (!res) return ERR_PTR(-ENOMEM); res->name = "System RAM"; res->start = start; res->end = start + size - 1; res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; conflict = request_resource_conflict(&iomem_resource, res); if (conflict) { if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { pr_debug("Device unaddressable memory block " "memory hotplug at %#010llx !\n", (unsigned long long)start); } pr_debug("System RAM resource %pR cannot be added\n", res); kfree(res); return ERR_PTR(-EEXIST); } return res; } static void release_memory_resource(struct resource *res) { if (!res) return; release_resource(res); kfree(res); return; } #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE void get_page_bootmem(unsigned long info, struct page *page, unsigned long type) { page->freelist = (void *)type; SetPagePrivate(page); set_page_private(page, info); page_ref_inc(page); } void put_page_bootmem(struct page *page) { unsigned long type; type = (unsigned long) page->freelist; BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); if (page_ref_dec_return(page) == 1) { page->freelist = NULL; ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); free_reserved_page(page); } } #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE #ifndef CONFIG_SPARSEMEM_VMEMMAP static void register_page_bootmem_info_section(unsigned long start_pfn) { unsigned long *usemap, mapsize, section_nr, i; struct mem_section *ms; struct page *page, *memmap; section_nr = pfn_to_section_nr(start_pfn); ms = __nr_to_section(section_nr); /* Get section's memmap address */ memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); /* * Get page for the memmap's phys address * XXX: need more consideration for sparse_vmemmap... */ page = virt_to_page(memmap); mapsize = sizeof(struct page) * PAGES_PER_SECTION; mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; /* remember memmap's page */ for (i = 0; i < mapsize; i++, page++) get_page_bootmem(section_nr, page, SECTION_INFO); usemap = ms->pageblock_flags; page = virt_to_page(usemap); mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; for (i = 0; i < mapsize; i++, page++) get_page_bootmem(section_nr, page, MIX_SECTION_INFO); } #else /* CONFIG_SPARSEMEM_VMEMMAP */ static void register_page_bootmem_info_section(unsigned long start_pfn) { unsigned long *usemap, mapsize, section_nr, i; struct mem_section *ms; struct page *page, *memmap; section_nr = pfn_to_section_nr(start_pfn); ms = __nr_to_section(section_nr); memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); usemap = ms->pageblock_flags; page = virt_to_page(usemap); mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; for (i = 0; i < mapsize; i++, page++) get_page_bootmem(section_nr, page, MIX_SECTION_INFO); } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) { unsigned long i, pfn, end_pfn, nr_pages; int node = pgdat->node_id; struct page *page; nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; page = virt_to_page(pgdat); for (i = 0; i < nr_pages; i++, page++) get_page_bootmem(node, page, NODE_INFO); pfn = pgdat->node_start_pfn; end_pfn = pgdat_end_pfn(pgdat); /* register section info */ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { /* * Some platforms can assign the same pfn to multiple nodes - on * node0 as well as nodeN. To avoid registering a pfn against * multiple nodes we check that this pfn does not already * reside in some other nodes. */ if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node)) register_page_bootmem_info_section(pfn); } } #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, struct vmem_altmap *altmap, bool want_memblock) { int ret; if (pfn_valid(phys_start_pfn)) return -EEXIST; ret = sparse_add_one_section(nid, phys_start_pfn, altmap); return ret < 0 ? ret : 0; } /* * Reasonably generic function for adding memory. It is * expected that archs that support memory hotplug will * call this function after deciding the zone to which to * add the new pages. */ int __ref __add_pages(int nid, unsigned long phys_start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap, bool want_memblock) { unsigned long i; int err = 0; int start_sec, end_sec; /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); if (altmap) { /* * Validate altmap is within bounds of the total request */ if (altmap->base_pfn != phys_start_pfn || vmem_altmap_offset(altmap) > nr_pages) { pr_warn_once("memory add fail, invalid altmap\n"); err = -EINVAL; goto out; } altmap->alloc = 0; } for (i = start_sec; i <= end_sec; i++) { err = __add_section(nid, section_nr_to_pfn(i), altmap, want_memblock); /* * EEXIST is finally dealt with by ioresource collision * check. see add_memory() => register_memory_resource() * Warning will be printed if there is collision. */ if (err && (err != -EEXIST)) break; err = 0; cond_resched(); } vmemmap_populate_print_last(); out: return err; } /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { if (unlikely(!pfn_to_online_page(start_pfn))) continue; if (unlikely(pfn_to_nid(start_pfn) != nid)) continue; if (zone && zone != page_zone(pfn_to_page(start_pfn))) continue; return start_pfn; } return 0; } /* find the biggest valid pfn in the range [start_pfn, end_pfn). */ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; /* pfn is the end pfn of a memory section. */ pfn = end_pfn - 1; for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { if (unlikely(!pfn_to_online_page(pfn))) continue; if (unlikely(pfn_to_nid(pfn) != nid)) continue; if (zone && zone != page_zone(pfn_to_page(pfn))) continue; return pfn; } return 0; } static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { unsigned long zone_start_pfn = zone->zone_start_pfn; unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ unsigned long zone_end_pfn = z; unsigned long pfn; int nid = zone_to_nid(zone); zone_span_writelock(zone); if (zone_start_pfn == start_pfn) { /* * If the section is smallest section in the zone, it need * shrink zone->zone_start_pfn and zone->zone_spanned_pages. * In this case, we find second smallest valid mem_section * for shrinking zone. */ pfn = find_smallest_section_pfn(nid, zone, end_pfn, zone_end_pfn); if (pfn) { zone->zone_start_pfn = pfn; zone->spanned_pages = zone_end_pfn - pfn; } } else if (zone_end_pfn == end_pfn) { /* * If the section is biggest section in the zone, it need * shrink zone->spanned_pages. * In this case, we find second biggest valid mem_section for * shrinking zone. */ pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn, start_pfn); if (pfn) zone->spanned_pages = pfn - zone_start_pfn + 1; } /* * The section is not biggest or smallest mem_section in the zone, it * only creates a hole in the zone. So in this case, we need not * change the zone. But perhaps, the zone has only hole data. Thus * it check the zone has only hole or not. */ pfn = zone_start_pfn; for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { if (unlikely(!pfn_to_online_page(pfn))) continue; if (page_zone(pfn_to_page(pfn)) != zone) continue; /* If the section is current section, it continues the loop */ if (start_pfn == pfn) continue; /* If we find valid section, we have nothing to do */ zone_span_writeunlock(zone); return; } /* The zone has no valid section */ zone->zone_start_pfn = 0; zone->spanned_pages = 0; zone_span_writeunlock(zone); } static void update_pgdat_span(struct pglist_data *pgdat) { unsigned long node_start_pfn = 0, node_end_pfn = 0; struct zone *zone; for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) { unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; /* No need to lock the zones, they can't change. */ if (!zone->spanned_pages) continue; if (!node_end_pfn) { node_start_pfn = zone->zone_start_pfn; node_end_pfn = zone_end_pfn; continue; } if (zone_end_pfn > node_end_pfn) node_end_pfn = zone_end_pfn; if (zone->zone_start_pfn < node_start_pfn) node_start_pfn = zone->zone_start_pfn; } pgdat->node_start_pfn = node_start_pfn; pgdat->node_spanned_pages = node_end_pfn - node_start_pfn; } void __ref remove_pfn_range_from_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { struct pglist_data *pgdat = zone->zone_pgdat; unsigned long flags; #ifdef CONFIG_ZONE_DEVICE /* * Zone shrinking code cannot properly deal with ZONE_DEVICE. So * we will not try to shrink the zones - which is okay as * set_zone_contiguous() cannot deal with ZONE_DEVICE either way. */ if (zone_idx(zone) == ZONE_DEVICE) return; #endif clear_zone_contiguous(zone); pgdat_resize_lock(zone->zone_pgdat, &flags); shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); update_pgdat_span(pgdat); pgdat_resize_unlock(zone->zone_pgdat, &flags); set_zone_contiguous(zone); } static void __remove_section(struct mem_section *ms, unsigned long map_offset, struct vmem_altmap *altmap) { unsigned long start_pfn; int scn_nr; if (WARN_ON_ONCE(!valid_section(ms))) return; scn_nr = __section_nr(ms); start_pfn = section_nr_to_pfn((unsigned long)scn_nr); sparse_remove_one_section(ms, map_offset, altmap); } /** * __remove_pages() - remove sections of pages * @phys_start_pfn: starting pageframe (must be aligned to start of a section) * @nr_pages: number of pages to remove (must be multiple of section size) * @altmap: alternative device page map or %NULL if default memmap is used * * Generic helper function to remove section mappings and sysfs entries * for the section of the memory we are removing. Caller needs to make * sure that pages are marked reserved and zones are adjust properly by * calling offline_pages(). */ void __remove_pages(unsigned long phys_start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { unsigned long i; unsigned long map_offset = 0; int sections_to_remove; if (altmap) map_offset = vmem_altmap_offset(altmap); /* * We can only remove entire sections */ BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); BUG_ON(nr_pages % PAGES_PER_SECTION); sections_to_remove = nr_pages / PAGES_PER_SECTION; for (i = 0; i < sections_to_remove; i++) { unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; cond_resched(); __remove_section(__pfn_to_section(pfn), map_offset, altmap); map_offset = 0; } } int set_online_page_callback(online_page_callback_t callback) { int rc = -EINVAL; get_online_mems(); mutex_lock(&online_page_callback_lock); if (online_page_callback == generic_online_page) { online_page_callback = callback; rc = 0; } mutex_unlock(&online_page_callback_lock); put_online_mems(); return rc; } EXPORT_SYMBOL_GPL(set_online_page_callback); int restore_online_page_callback(online_page_callback_t callback) { int rc = -EINVAL; get_online_mems(); mutex_lock(&online_page_callback_lock); if (online_page_callback == callback) { online_page_callback = generic_online_page; rc = 0; } mutex_unlock(&online_page_callback_lock); put_online_mems(); return rc; } EXPORT_SYMBOL_GPL(restore_online_page_callback); void __online_page_set_limits(struct page *page) { } EXPORT_SYMBOL_GPL(__online_page_set_limits); void __online_page_increment_counters(struct page *page) { adjust_managed_page_count(page, 1); } EXPORT_SYMBOL_GPL(__online_page_increment_counters); void __online_page_free(struct page *page) { __free_reserved_page(page); } EXPORT_SYMBOL_GPL(__online_page_free); static void generic_online_page(struct page *page) { __online_page_set_limits(page); __online_page_increment_counters(page); __online_page_free(page); } static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, void *arg) { unsigned long i; unsigned long onlined_pages = *(unsigned long *)arg; struct page *page; if (PageReserved(pfn_to_page(start_pfn))) for (i = 0; i < nr_pages; i++) { page = pfn_to_page(start_pfn + i); (*online_page_callback)(page); onlined_pages++; } online_mem_sections(start_pfn, start_pfn + nr_pages); *(unsigned long *)arg = onlined_pages; return 0; } /* check which state of node_states will be changed when online memory */ static void node_states_check_changes_online(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) { int nid = zone_to_nid(zone); enum zone_type zone_last = ZONE_NORMAL; /* * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] * contains nodes which have zones of 0...ZONE_NORMAL, * set zone_last to ZONE_NORMAL. * * If we don't have HIGHMEM nor movable node, * node_states[N_NORMAL_MEMORY] contains nodes which have zones of * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. */ if (N_MEMORY == N_NORMAL_MEMORY) zone_last = ZONE_MOVABLE; /* * if the memory to be online is in a zone of 0...zone_last, and * the zones of 0...zone_last don't have memory before online, we will * need to set the node to node_states[N_NORMAL_MEMORY] after * the memory is online. */ if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) arg->status_change_nid_normal = nid; else arg->status_change_nid_normal = -1; #ifdef CONFIG_HIGHMEM /* * If we have movable node, node_states[N_HIGH_MEMORY] * contains nodes which have zones of 0...ZONE_HIGHMEM, * set zone_last to ZONE_HIGHMEM. * * If we don't have movable node, node_states[N_NORMAL_MEMORY] * contains nodes which have zones of 0...ZONE_MOVABLE, * set zone_last to ZONE_MOVABLE. */ zone_last = ZONE_HIGHMEM; if (N_MEMORY == N_HIGH_MEMORY) zone_last = ZONE_MOVABLE; if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY)) arg->status_change_nid_high = nid; else arg->status_change_nid_high = -1; #else arg->status_change_nid_high = arg->status_change_nid_normal; #endif /* * if the node don't have memory befor online, we will need to * set the node to node_states[N_MEMORY] after the memory * is online. */ if (!node_state(nid, N_MEMORY)) arg->status_change_nid = nid; else arg->status_change_nid = -1; } static void node_states_set_node(int node, struct memory_notify *arg) { if (arg->status_change_nid_normal >= 0) node_set_state(node, N_NORMAL_MEMORY); if (arg->status_change_nid_high >= 0) node_set_state(node, N_HIGH_MEMORY); node_set_state(node, N_MEMORY); } static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { unsigned long old_end_pfn = zone_end_pfn(zone); if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) zone->zone_start_pfn = start_pfn; zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn; } static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long nr_pages) { unsigned long old_end_pfn = pgdat_end_pfn(pgdat); if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) pgdat->node_start_pfn = start_pfn; pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; } void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { struct pglist_data *pgdat = zone->zone_pgdat; int nid = pgdat->node_id; unsigned long flags; if (zone_is_empty(zone)) init_currently_empty_zone(zone, start_pfn, nr_pages); clear_zone_contiguous(zone); /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */ pgdat_resize_lock(pgdat, &flags); zone_span_writelock(zone); resize_zone_range(zone, start_pfn, nr_pages); zone_span_writeunlock(zone); resize_pgdat_range(pgdat, start_pfn, nr_pages); pgdat_resize_unlock(pgdat, &flags); /* * TODO now we have a visible range of pages which are not associated * with their zone properly. Not nice but set_pfnblock_flags_mask * expects the zone spans the pfn range. All the pages in the range * are reserved so nobody should be touching them so we should be safe */ memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMINIT_HOTPLUG, altmap); set_zone_contiguous(zone); } /* * Returns a default kernel memory zone for the given pfn range. * If no kernel zone covers this pfn range it will automatically go * to the ZONE_NORMAL. */ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn, unsigned long nr_pages) { struct pglist_data *pgdat = NODE_DATA(nid); int zid; for (zid = 0; zid <= ZONE_NORMAL; zid++) { struct zone *zone = &pgdat->node_zones[zid]; if (zone_intersects(zone, start_pfn, nr_pages)) return zone; } return &pgdat->node_zones[ZONE_NORMAL]; } static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, unsigned long nr_pages) { struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn, nr_pages); struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages); bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages); /* * We inherit the existing zone in a simple case where zones do not * overlap in the given range */ if (in_kernel ^ in_movable) return (in_kernel) ? kernel_zone : movable_zone; /* * If the range doesn't belong to any zone or two zones overlap in the * given range then we use movable zone only if movable_node is * enabled because we always online to a kernel zone by default. */ return movable_node_enabled ? movable_zone : kernel_zone; } struct zone *zone_for_pfn_range(int online_type, int nid, unsigned long start_pfn, unsigned long nr_pages) { if (online_type == MMOP_ONLINE_KERNEL) return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages); if (online_type == MMOP_ONLINE_MOVABLE) return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; return default_zone_for_pfn(nid, start_pfn, nr_pages); } /* * Associates the given pfn range with the given node and the zone appropriate * for the given online type. */ static struct zone * __meminit move_pfn_range(int online_type, int nid, unsigned long start_pfn, unsigned long nr_pages) { struct zone *zone; zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages); move_pfn_range_to_zone(zone, start_pfn, nr_pages, NULL); return zone; } int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { unsigned long flags; unsigned long onlined_pages = 0; struct zone *zone; int need_zonelists_rebuild = 0; int nid; int ret; struct memory_notify arg; struct memory_block *mem; mem_hotplug_begin(); /* * We can't use pfn_to_nid() because nid might be stored in struct page * which is not yet initialized. Instead, we find nid from memory block. */ mem = find_memory_block(__pfn_to_section(pfn)); nid = mem->nid; put_device(&mem->dev); /* associate pfn range with the zone */ zone = move_pfn_range(online_type, nid, pfn, nr_pages); arg.start_pfn = pfn; arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); if (ret) goto failed_addition; /* * If this zone is not populated, then it is not in zonelist. * This means the page allocator ignores this zone. * So, zonelist must be updated after online. */ if (!populated_zone(zone)) { need_zonelists_rebuild = 1; setup_zone_pageset(zone); } ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, online_pages_range); if (ret) { if (need_zonelists_rebuild) zone_pcp_reset(zone); goto failed_addition; } zone->present_pages += onlined_pages; pgdat_resize_lock(zone->zone_pgdat, &flags); zone->zone_pgdat->node_present_pages += onlined_pages; pgdat_resize_unlock(zone->zone_pgdat, &flags); if (onlined_pages) { node_states_set_node(nid, &arg); if (need_zonelists_rebuild) build_all_zonelists(NULL); else zone_pcp_update(zone); } init_per_zone_wmark_min(); if (onlined_pages) { kswapd_run(nid); kcompactd_run(nid); } vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); if (onlined_pages) memory_notify(MEM_ONLINE, &arg); mem_hotplug_done(); return 0; failed_addition: pr_debug("online_pages [mem %#010llx-%#010llx] failed\n", (unsigned long long) pfn << PAGE_SHIFT, (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); remove_pfn_range_from_zone(zone, pfn, nr_pages); mem_hotplug_done(); return ret; } #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ static void reset_node_present_pages(pg_data_t *pgdat) { struct zone *z; for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) z->present_pages = 0; pgdat->node_present_pages = 0; } /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) { struct pglist_data *pgdat; unsigned long start_pfn = PFN_DOWN(start); pgdat = NODE_DATA(nid); if (!pgdat) { pgdat = arch_alloc_nodedata(nid); if (!pgdat) return NULL; arch_refresh_nodedata(nid, pgdat); } else { /* * Reset the nr_zones, order and classzone_idx before reuse. * Note that kswapd will init kswapd_classzone_idx properly * when it starts in the near future. */ pgdat->nr_zones = 0; pgdat->kswapd_order = 0; pgdat->kswapd_classzone_idx = 0; } /* we can use NODE_DATA(nid) from here */ pgdat->node_id = nid; pgdat->node_start_pfn = start_pfn; /* init node's zones as empty zones, we don't have any present pages.*/ free_area_init_core_hotplug(nid); pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); /* * The node we allocated has no zone fallback lists. For avoiding * to access not-initialized zonelist, build here. */ build_all_zonelists(pgdat); /* * When memory is hot-added, all the memory is in offline state. So * clear all zones' present_pages because they will be updated in * online_pages() and offline_pages(). */ reset_node_managed_pages(pgdat); reset_node_present_pages(pgdat); return pgdat; } static void rollback_node_hotadd(int nid) { pg_data_t *pgdat = NODE_DATA(nid); arch_refresh_nodedata(nid, NULL); free_percpu(pgdat->per_cpu_nodestats); arch_free_nodedata(pgdat); return; } /** * try_online_node - online a node if offlined * @nid: the node ID * @start: start addr of the node * @set_node_online: Whether we want to online the node * called by cpu_up() to online a node without onlined memory. * * Returns: * 1 -> a new node has been allocated * 0 -> the node is already online * -ENOMEM -> the node could not be allocated */ static int __try_online_node(int nid, u64 start, bool set_node_online) { pg_data_t *pgdat; int ret = 1; if (node_online(nid)) return 0; pgdat = hotadd_new_pgdat(nid, start); if (!pgdat) { pr_err("Cannot online node %d due to NULL pgdat\n", nid); ret = -ENOMEM; goto out; } if (set_node_online) { node_set_online(nid); ret = register_one_node(nid); BUG_ON(ret); } out: return ret; } /* * Users of this function always want to online/register the node */ int try_online_node(int nid) { int ret; mem_hotplug_begin(); ret = __try_online_node(nid, 0, true); mem_hotplug_done(); return ret; } static int check_hotplug_memory_range(u64 start, u64 size) { unsigned long block_sz = memory_block_size_bytes(); u64 block_nr_pages = block_sz >> PAGE_SHIFT; u64 nr_pages = size >> PAGE_SHIFT; u64 start_pfn = PFN_DOWN(start); /* memory range must be block size aligned */ if (!nr_pages || !IS_ALIGNED(start_pfn, block_nr_pages) || !IS_ALIGNED(nr_pages, block_nr_pages)) { pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx", block_sz, start, size); return -EINVAL; } return 0; } static int online_memory_block(struct memory_block *mem, void *arg) { return device_online(&mem->dev); } /* * NOTE: The caller must call lock_device_hotplug() to serialize hotplug * and online/offline operations (triggered e.g. by sysfs). * * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ int __ref add_memory_resource(int nid, struct resource *res, bool online) { u64 start, size; bool new_node = false; int ret; start = res->start; size = resource_size(res); ret = check_hotplug_memory_range(start, size); if (ret) return ret; mem_hotplug_begin(); /* * Add new range to memblock so that when hotadd_new_pgdat() is called * to allocate new pgdat, get_pfn_range_for_nid() will be able to find * this new range and calculate total pages correctly. The range will * be removed at hot-remove time. */ memblock_add_node(start, size, nid); ret = __try_online_node(nid, start, false); if (ret < 0) goto error; new_node = ret; /* call arch's memory hotadd */ ret = arch_add_memory(nid, start, size, NULL, true); if (ret < 0) goto error; /* create memory block devices after memory was added */ ret = create_memory_block_devices(start, size); if (ret) { arch_remove_memory(nid, start, size, NULL); goto error; } if (new_node) { /* If sysfs file of new node can't be created, cpu on the node * can't be hot-added. There is no rollback way now. * So, check by BUG_ON() to catch it reluctantly.. * We online node here. We can't roll back from here. */ node_set_online(nid); ret = __register_one_node(nid); BUG_ON(ret); } /* link memory sections under this node.*/ ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1), MEMINIT_HOTPLUG); BUG_ON(ret); /* create new memmap entry */ firmware_map_add_hotplug(start, start + size, "System RAM"); /* device_online() will take the lock when calling online_pages() */ mem_hotplug_done(); /* online pages if requested */ if (online) walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, online_memory_block); return ret; error: /* rollback pgdat allocation and others */ if (new_node) rollback_node_hotadd(nid); memblock_remove(start, size); mem_hotplug_done(); return ret; } /* requires device_hotplug_lock, see add_memory_resource() */ int __ref __add_memory(int nid, u64 start, u64 size) { struct resource *res; int ret; res = register_memory_resource(start, size); if (IS_ERR(res)) return PTR_ERR(res); ret = add_memory_resource(nid, res, memhp_auto_online); if (ret < 0) release_memory_resource(res); return ret; } int add_memory(int nid, u64 start, u64 size) { int rc; lock_device_hotplug(); rc = __add_memory(nid, start, size); unlock_device_hotplug(); return rc; } EXPORT_SYMBOL_GPL(add_memory); #ifdef CONFIG_MEMORY_HOTREMOVE /* * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy * set and the size of the free page is given by page_order(). Using this, * the function determines if the pageblock contains only free pages. * Due to buddy contraints, a free page at least the size of a pageblock will * be located at the start of the pageblock */ static inline int pageblock_free(struct page *page) { return PageBuddy(page) && page_order(page) >= pageblock_order; } /* Return the pfn of the start of the next active pageblock after a given pfn */ static unsigned long next_active_pageblock(unsigned long pfn) { struct page *page = pfn_to_page(pfn); /* Ensure the starting page is pageblock-aligned */ BUG_ON(pfn & (pageblock_nr_pages - 1)); /* If the entire pageblock is free, move to the end of free page */ if (pageblock_free(page)) { int order; /* be careful. we don't have locks, page_order can be changed.*/ order = page_order(page); if ((order < MAX_ORDER) && (order >= pageblock_order)) return pfn + (1 << order); } return pfn + pageblock_nr_pages; } static bool is_pageblock_removable_nolock(unsigned long pfn) { struct page *page = pfn_to_page(pfn); struct zone *zone; /* * We have to be careful here because we are iterating over memory * sections which are not zone aware so we might end up outside of * the zone but still within the section. * We have to take care about the node as well. If the node is offline * its NODE_DATA will be NULL - see page_zone. */ if (!node_online(page_to_nid(page))) return false; zone = page_zone(page); pfn = page_to_pfn(page); if (!zone_spans_pfn(zone, pfn)) return false; return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); } /* Checks if this range of memory is likely to be hot-removable. */ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { unsigned long end_pfn, pfn; end_pfn = min(start_pfn + nr_pages, zone_end_pfn(page_zone(pfn_to_page(start_pfn)))); /* Check the starting page of each pageblock within the range */ for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) { if (!is_pageblock_removable_nolock(pfn)) return false; cond_resched(); } /* All pageblocks in the memory block are likely to be hot-removable */ return true; } /* * Confirm all pages in a range [start, end) belong to the same zone. * When true, return its valid [start, end). */ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, unsigned long *valid_start, unsigned long *valid_end) { unsigned long pfn, sec_end_pfn; unsigned long start, end; struct zone *zone = NULL; struct page *page; int i; for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1); pfn < end_pfn; pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) { /* Make sure the memory section is present first */ if (!present_section_nr(pfn_to_section_nr(pfn))) continue; for (; pfn < sec_end_pfn && pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) { i = 0; /* This is just a CONFIG_HOLES_IN_ZONE check.*/ while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i)) i++; if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn) continue; /* Check if we got outside of the zone */ if (zone && !zone_spans_pfn(zone, pfn + i)) return 0; page = pfn_to_page(pfn + i); if (zone && page_zone(page) != zone) return 0; if (!zone) start = pfn + i; zone = page_zone(page); end = pfn + MAX_ORDER_NR_PAGES; } } if (zone) { *valid_start = start; *valid_end = min(end, end_pfn); return 1; } else { return 0; } } /* * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, * non-lru movable pages and hugepages). We scan pfn because it's much * easier than scanning over linked list. This function returns the pfn * of the first found movable page if it's found, otherwise 0. */ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) { unsigned long pfn; for (pfn = start; pfn < end; pfn++) { struct page *page, *head; unsigned long skip; if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); if (PageLRU(page)) return pfn; if (__PageMovable(page)) return pfn; if (!PageHuge(page)) continue; head = compound_head(page); if (hugepage_migration_supported(page_hstate(head)) && page_huge_active(head)) return pfn; skip = (1 << compound_order(head)) - (page - head); pfn += skip - 1; } return 0; } static struct page *new_node_page(struct page *page, unsigned long private) { int nid = page_to_nid(page); nodemask_t nmask = node_states[N_MEMORY]; /* * try to allocate from a different node but reuse this node if there * are no other online nodes to be used (e.g. we are offlining a part * of the only existing node) */ node_clear(nid, nmask); if (nodes_empty(nmask)) node_set(nid, nmask); return new_page_nodemask(page, nid, &nmask); } #define NR_OFFLINE_AT_ONCE_PAGES (256) static int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; struct page *page; int move_pages = NR_OFFLINE_AT_ONCE_PAGES; int not_managed = 0; int ret = 0; LIST_HEAD(source); for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); if (PageHuge(page)) { struct page *head = compound_head(page); pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; if (compound_order(head) > PFN_SECTION_SHIFT) { ret = -EBUSY; break; } if (isolate_huge_page(page, &source)) move_pages -= 1 << compound_order(head); continue; } else if (PageTransHuge(page)) pfn = page_to_pfn(compound_head(page)) + hpage_nr_pages(page) - 1; /* * HWPoison pages have elevated reference counts so the migration would * fail on them. It also doesn't make any sense to migrate them in the * first place. Still try to unmap such a page in case it is still mapped * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep * the unmap as the catch all safety net). */ if (PageHWPoison(page)) { if (WARN_ON(PageLRU(page))) isolate_lru_page(page); if (page_mapped(page)) try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS); continue; } if (!get_page_unless_zero(page)) continue; /* * We can skip free pages. And we can deal with pages on * LRU and non-lru movable pages. */ if (PageLRU(page)) ret = isolate_lru_page(page); else ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); if (!ret) { /* Success */ put_page(page); list_add_tail(&page->lru, &source); move_pages--; if (!__PageMovable(page)) inc_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); } else { #ifdef CONFIG_DEBUG_VM pr_alert("failed to isolate pfn %lx\n", pfn); dump_page(page, "isolation failed"); #endif put_page(page); /* Because we don't have big zone->lock. we should check this again here. */ if (page_count(page)) { not_managed++; ret = -EBUSY; break; } } } if (!list_empty(&source)) { if (not_managed) { putback_movable_pages(&source); goto out; } /* Allocate a new page from the nearest neighbor node */ ret = migrate_pages(&source, new_node_page, NULL, 0, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) putback_movable_pages(&source); } out: return ret; } /* * remove from free_area[] and mark all as Reserved. */ static int offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, void *data) { __offline_isolated_pages(start, start + nr_pages); return 0; } static void offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, offline_isolated_pages_cb); } /* * Check all pages in range, recoreded as memory resource, are isolated. */ static int check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, void *data) { int ret; long offlined = *(long *)data; ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); offlined = nr_pages; if (!ret) *(long *)data += offlined; return ret; } static long check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) { long offlined = 0; int ret; ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, check_pages_isolated_cb); if (ret < 0) offlined = (long)ret; return offlined; } static int __init cmdline_parse_movable_node(char *p) { #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP movable_node_enabled = true; #else pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n"); #endif return 0; } early_param("movable_node", cmdline_parse_movable_node); /* check which state of node_states will be changed when offline memory */ static void node_states_check_changes_offline(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) { struct pglist_data *pgdat = zone->zone_pgdat; unsigned long present_pages = 0; enum zone_type zt, zone_last = ZONE_NORMAL; /* * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] * contains nodes which have zones of 0...ZONE_NORMAL, * set zone_last to ZONE_NORMAL. * * If we don't have HIGHMEM nor movable node, * node_states[N_NORMAL_MEMORY] contains nodes which have zones of * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. */ if (N_MEMORY == N_NORMAL_MEMORY) zone_last = ZONE_MOVABLE; /* * check whether node_states[N_NORMAL_MEMORY] will be changed. * If the memory to be offline is in a zone of 0...zone_last, * and it is the last present memory, 0...zone_last will * become empty after offline , thus we can determind we will * need to clear the node from node_states[N_NORMAL_MEMORY]. */ for (zt = 0; zt <= zone_last; zt++) present_pages += pgdat->node_zones[zt].present_pages; if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) arg->status_change_nid_normal = zone_to_nid(zone); else arg->status_change_nid_normal = -1; #ifdef CONFIG_HIGHMEM /* * If we have movable node, node_states[N_HIGH_MEMORY] * contains nodes which have zones of 0...ZONE_HIGHMEM, * set zone_last to ZONE_HIGHMEM. * * If we don't have movable node, node_states[N_NORMAL_MEMORY] * contains nodes which have zones of 0...ZONE_MOVABLE, * set zone_last to ZONE_MOVABLE. */ zone_last = ZONE_HIGHMEM; if (N_MEMORY == N_HIGH_MEMORY) zone_last = ZONE_MOVABLE; for (; zt <= zone_last; zt++) present_pages += pgdat->node_zones[zt].present_pages; if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) arg->status_change_nid_high = zone_to_nid(zone); else arg->status_change_nid_high = -1; #else arg->status_change_nid_high = arg->status_change_nid_normal; #endif /* * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE */ zone_last = ZONE_MOVABLE; /* * check whether node_states[N_HIGH_MEMORY] will be changed * If we try to offline the last present @nr_pages from the node, * we can determind we will need to clear the node from * node_states[N_HIGH_MEMORY]. */ for (; zt <= zone_last; zt++) present_pages += pgdat->node_zones[zt].present_pages; if (nr_pages >= present_pages) arg->status_change_nid = zone_to_nid(zone); else arg->status_change_nid = -1; } static void node_states_clear_node(int node, struct memory_notify *arg) { if (arg->status_change_nid_normal >= 0) node_clear_state(node, N_NORMAL_MEMORY); if ((N_MEMORY != N_NORMAL_MEMORY) && (arg->status_change_nid_high >= 0)) node_clear_state(node, N_HIGH_MEMORY); if ((N_MEMORY != N_HIGH_MEMORY) && (arg->status_change_nid >= 0)) node_clear_state(node, N_MEMORY); } static int __ref __offline_pages(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn, nr_pages; long offlined_pages; int ret, node; unsigned long flags; unsigned long valid_start, valid_end; struct zone *zone; struct memory_notify arg; /* at least, alignment against pageblock is necessary */ if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) return -EINVAL; if (!IS_ALIGNED(end_pfn, pageblock_nr_pages)) return -EINVAL; mem_hotplug_begin(); /* This makes hotplug much easier...and readable. we assume this for now. .*/ if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) { mem_hotplug_done(); return -EINVAL; } zone = page_zone(pfn_to_page(valid_start)); node = zone_to_nid(zone); nr_pages = end_pfn - start_pfn; /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, true); if (ret) { mem_hotplug_done(); return ret; } arg.start_pfn = start_pfn; arg.nr_pages = nr_pages; node_states_check_changes_offline(nr_pages, zone, &arg); ret = memory_notify(MEM_GOING_OFFLINE, &arg); ret = notifier_to_errno(ret); if (ret) goto failed_removal; pfn = start_pfn; repeat: /* start memory hot removal */ ret = -EINTR; if (signal_pending(current)) goto failed_removal; cond_resched(); lru_add_drain_all(); drain_all_pages(zone); pfn = scan_movable_pages(start_pfn, end_pfn); if (pfn) { /* We have movable pages */ ret = do_migrate_range(pfn, end_pfn); goto repeat; } /* * dissolve free hugepages in the memory block before doing offlining * actually in order to make hugetlbfs's object counting consistent. */ ret = dissolve_free_huge_pages(start_pfn, end_pfn); if (ret) goto failed_removal; /* check again */ offlined_pages = check_pages_isolated(start_pfn, end_pfn); if (offlined_pages < 0) goto repeat; pr_info("Offlined Pages %ld\n", offlined_pages); /* Ok, all of our target is isolated. We cannot do rollback at this point. */ offline_isolated_pages(start_pfn, end_pfn); /* reset pagetype flags and makes migrate type to be MOVABLE */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); /* removal success */ adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); zone->present_pages -= offlined_pages; pgdat_resize_lock(zone->zone_pgdat, &flags); zone->zone_pgdat->node_present_pages -= offlined_pages; pgdat_resize_unlock(zone->zone_pgdat, &flags); init_per_zone_wmark_min(); if (!populated_zone(zone)) { zone_pcp_reset(zone); build_all_zonelists(NULL); } else zone_pcp_update(zone); node_states_clear_node(node, &arg); if (arg.status_change_nid >= 0) { kswapd_stop(node); kcompactd_stop(node); } vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); remove_pfn_range_from_zone(zone, start_pfn, nr_pages); mem_hotplug_done(); return 0; failed_removal: pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n", (unsigned long long) start_pfn << PAGE_SHIFT, ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_OFFLINE, &arg); /* pushback to free area */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); mem_hotplug_done(); return ret; } int offline_pages(unsigned long start_pfn, unsigned long nr_pages) { return __offline_pages(start_pfn, start_pfn + nr_pages); } #endif /* CONFIG_MEMORY_HOTREMOVE */ /** * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) * @start_pfn: start pfn of the memory range * @end_pfn: end pfn of the memory range * @arg: argument passed to func * @func: callback for each memory section walked * * This function walks through all present mem sections in range * [start_pfn, end_pfn) and call func on each mem section. * * Returns the return value of func. */ int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)) { struct memory_block *mem = NULL; struct mem_section *section; unsigned long pfn, section_nr; int ret; for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { section_nr = pfn_to_section_nr(pfn); if (!present_section_nr(section_nr)) continue; section = __nr_to_section(section_nr); /* same memblock? */ if (mem) if ((section_nr >= mem->start_section_nr) && (section_nr <= mem->end_section_nr)) continue; mem = find_memory_block_hinted(section, mem); if (!mem) continue; ret = func(mem, arg); if (ret) { kobject_put(&mem->dev.kobj); return ret; } } if (mem) kobject_put(&mem->dev.kobj); return 0; } #ifdef CONFIG_MEMORY_HOTREMOVE static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) { int ret = !is_memblock_offlined(mem); if (unlikely(ret)) { phys_addr_t beginpa, endpa; beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", &beginpa, &endpa); } return ret; } static int check_cpu_on_node(pg_data_t *pgdat) { int cpu; for_each_present_cpu(cpu) { if (cpu_to_node(cpu) == pgdat->node_id) /* * the cpu on this node isn't removed, and we can't * offline this node. */ return -EBUSY; } return 0; } static void unmap_cpu_on_node(pg_data_t *pgdat) { #ifdef CONFIG_ACPI_NUMA int cpu; for_each_possible_cpu(cpu) if (cpu_to_node(cpu) == pgdat->node_id) numa_clear_node(cpu); #endif } static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) { int ret; ret = check_cpu_on_node(pgdat); if (ret) return ret; /* * the node will be offlined when we come here, so we can clear * the cpu_to_node() now. */ unmap_cpu_on_node(pgdat); return 0; } static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg) { int nid = *(int *)arg; /* * If a memory block belongs to multiple nodes, the stored nid is not * reliable. However, such blocks are always online (e.g., cannot get * offlined) and, therefore, are still spanned by the node. */ return mem->nid == nid ? -EEXIST : 0; } /** * try_offline_node * @nid: the node ID * * Offline a node if all memory sections and cpus of the node are removed. * * NOTE: The caller must call lock_device_hotplug() to serialize hotplug * and online/offline operations before this call. */ void try_offline_node(int nid) { pg_data_t *pgdat = NODE_DATA(nid); int rc; /* * If the node still spans pages (especially ZONE_DEVICE), don't * offline it. A node spans memory after move_pfn_range_to_zone(), * e.g., after the memory block was onlined. */ if (pgdat->node_spanned_pages) return; /* * Especially offline memory blocks might not be spanned by the * node. They will get spanned by the node once they get onlined. * However, they link to the node in sysfs and can get onlined later. */ rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb); if (rc) return; if (check_and_unmap_cpu_on_node(pgdat)) return; /* * all memory/cpu of this node are removed, we can offline this * node now. */ node_set_offline(nid); unregister_one_node(nid); } EXPORT_SYMBOL(try_offline_node); static void __release_memory_resource(resource_size_t start, resource_size_t size) { int ret; /* * When removing memory in the same granularity as it was added, * this function never fails. It might only fail if resources * have to be adjusted or split. We'll ignore the error, as * removing of memory cannot fail. */ ret = release_mem_region_adjustable(&iomem_resource, start, size); if (ret) { resource_size_t endres = start + size - 1; pr_warn("Unable to release resource <%pa-%pa> (%d)\n", &start, &endres, ret); } } /** * remove_memory * @nid: the node ID * @start: physical address of the region to remove * @size: size of the region to remove * * NOTE: The caller must call lock_device_hotplug() to serialize hotplug * and online/offline operations before this call, as required by * try_offline_node(). */ void __ref __remove_memory(int nid, u64 start, u64 size) { int ret; BUG_ON(check_hotplug_memory_range(start, size)); /* * All memory blocks must be offlined before removing memory. Check * whether all memory blocks in question are offline and trigger a BUG() * if this is not the case. */ ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, check_memblock_offlined_cb); if (ret) BUG(); /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); memblock_free(start, size); memblock_remove(start, size); /* * Memory block device removal under the device_hotplug_lock is * a barrier against racing online attempts. */ remove_memory_block_devices(start, size); mem_hotplug_begin(); arch_remove_memory(nid, start, size, NULL); __release_memory_resource(start, size); try_offline_node(nid); mem_hotplug_done(); } void remove_memory(int nid, u64 start, u64 size) { lock_device_hotplug(); __remove_memory(nid, start, size); unlock_device_hotplug(); } EXPORT_SYMBOL_GPL(remove_memory); #endif /* CONFIG_MEMORY_HOTREMOVE */ |
203 116 46 46 181 181 181 1469 1470 22 22 34 105 104 105 105 74 74 7 7 7 7 7 70 46 46 46 46 46 74 74 74 74 74 74 74 74 74 74 74 7 2 4372 4374 4375 21 21 21 4368 1 46 46 46 46 46 46 46 46 46 30 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 6 6 7 7 1 7 7 7 7 121 121 121 121 120 121 121 4172 334 4171 4219 4224 1539 1471 1390 4225 3204 1505 4170 22 22 22 72 72 380 351 2 351 380 9 9 5 9 9 1634 1486 1485 1486 788 788 787 788 577 373 6 378 517 518 518 518 518 517 4215 649 4214 12 4202 4199 4214 4134 7 7 7 7 7 7 7 7 7 7 7 55 4251 25 1544 4251 3097 3093 4198 4200 54 1545 1545 1545 1514 500 499 1544 4251 4253 4249 4249 3095 4231 15 4233 25 4230 65 4252 4250 4251 4249 4231 4232 4233 4250 4252 4168 4166 2813 4169 4251 9 9 3 3 93 93 93 93 93 93 93 4212 358 349 4213 4215 4216 108 108 105 105 105 105 105 105 105 105 108 80 1111 63 4294 4214 3491 3444 3363 3363 2329 3357 3362 3195 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 | /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1994, Karl Keyte: Added support for disk statistics * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> * - July2000 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 */ /* * This handles all read/write requests to block devices */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/kernel_stat.h> #include <linux/string.h> #include <linux/init.h> #include <linux/completion.h> #include <linux/slab.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/task_io_accounting_ops.h> #include <linux/fault-inject.h> #include <linux/list_sort.h> #include <linux/delay.h> #include <linux/ratelimit.h> #include <linux/pm_runtime.h> #include <linux/blk-cgroup.h> #include <linux/debugfs.h> #include <linux/bpf.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" #ifdef CONFIG_DEBUG_FS struct dentry *blk_debugfs_root; #endif EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); EXPORT_TRACEPOINT_SYMBOL_GPL(block_split); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); DEFINE_IDA(blk_queue_ida); /* * For the allocated request tables */ struct kmem_cache *request_cachep; /* * For queue allocation */ struct kmem_cache *blk_requestq_cachep; /* * Controlling structure to kblockd */ static struct workqueue_struct *kblockd_workqueue; /** * blk_queue_flag_set - atomically set a queue flag * @flag: flag to be set * @q: request queue */ void blk_queue_flag_set(unsigned int flag, struct request_queue *q) { unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); queue_flag_set(flag, q); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL(blk_queue_flag_set); /** * blk_queue_flag_clear - atomically clear a queue flag * @flag: flag to be cleared * @q: request queue */ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) { unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); queue_flag_clear(flag, q); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL(blk_queue_flag_clear); /** * blk_queue_flag_test_and_set - atomically test and set a queue flag * @flag: flag to be set * @q: request queue * * Returns the previous value of @flag - 0 if the flag was not set and 1 if * the flag was already set. */ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q) { unsigned long flags; bool res; spin_lock_irqsave(q->queue_lock, flags); res = queue_flag_test_and_set(flag, q); spin_unlock_irqrestore(q->queue_lock, flags); return res; } EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set); /** * blk_queue_flag_test_and_clear - atomically test and clear a queue flag * @flag: flag to be cleared * @q: request queue * * Returns the previous value of @flag - 0 if the flag was not set and 1 if * the flag was set. */ bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q) { unsigned long flags; bool res; spin_lock_irqsave(q->queue_lock, flags); res = queue_flag_test_and_clear(flag, q); spin_unlock_irqrestore(q->queue_lock, flags); return res; } EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_clear); static void blk_clear_congested(struct request_list *rl, int sync) { #ifdef CONFIG_CGROUP_WRITEBACK clear_wb_congested(rl->blkg->wb_congested, sync); #else /* * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't * flip its congestion state for events on other blkcgs. */ if (rl == &rl->q->root_rl) clear_wb_congested(rl->q->backing_dev_info->wb.congested, sync); #endif } static void blk_set_congested(struct request_list *rl, int sync) { #ifdef CONFIG_CGROUP_WRITEBACK set_wb_congested(rl->blkg->wb_congested, sync); #else /* see blk_clear_congested() */ if (rl == &rl->q->root_rl) set_wb_congested(rl->q->backing_dev_info->wb.congested, sync); #endif } void blk_queue_congestion_threshold(struct request_queue *q) { int nr; nr = q->nr_requests - (q->nr_requests / 8) + 1; if (nr > q->nr_requests) nr = q->nr_requests; q->nr_congestion_on = nr; nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; if (nr < 1) nr = 1; q->nr_congestion_off = nr; } void blk_rq_init(struct request_queue *q, struct request *rq) { memset(rq, 0, sizeof(*rq)); INIT_LIST_HEAD(&rq->queuelist); INIT_LIST_HEAD(&rq->timeout_list); rq->cpu = -1; rq->q = q; rq->__sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->tag = -1; rq->internal_tag = -1; rq->start_time_ns = ktime_get_ns(); rq->part = NULL; refcount_set(&rq->ref, 1); } EXPORT_SYMBOL(blk_rq_init); static const struct { int errno; const char *name; } blk_errors[] = { [BLK_STS_OK] = { 0, "" }, [BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" }, [BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" }, [BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" }, [BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" }, [BLK_STS_TARGET] = { -EREMOTEIO, "critical target" }, [BLK_STS_NEXUS] = { -EBADE, "critical nexus" }, [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" }, [BLK_STS_PROTECTION] = { -EILSEQ, "protection" }, [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" }, [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, /* device mapper special case, should not leak out: */ [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" }, /* everything else not covered above: */ [BLK_STS_IOERR] = { -EIO, "I/O" }, }; blk_status_t errno_to_blk_status(int errno) { int i; for (i = 0; i < ARRAY_SIZE(blk_errors); i++) { if (blk_errors[i].errno == errno) return (__force blk_status_t)i; } return BLK_STS_IOERR; } EXPORT_SYMBOL_GPL(errno_to_blk_status); int blk_status_to_errno(blk_status_t status) { int idx = (__force int)status; if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) return -EIO; return blk_errors[idx].errno; } EXPORT_SYMBOL_GPL(blk_status_to_errno); static void print_req_error(struct request *req, blk_status_t status) { int idx = (__force int)status; if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) return; printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n", __func__, blk_errors[idx].name, req->rq_disk ? req->rq_disk->disk_name : "?", (unsigned long long)blk_rq_pos(req)); } static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, blk_status_t error) { if (error) bio->bi_status = error; if (unlikely(rq->rq_flags & RQF_QUIET)) bio_set_flag(bio, BIO_QUIET); bio_advance(bio, nbytes); /* don't actually finish bio if it's part of flush sequence */ if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) bio_endio(bio); } void blk_dump_rq_flags(struct request *rq, char *msg) { printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, rq->rq_disk ? rq->rq_disk->disk_name : "?", (unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); printk(KERN_INFO " bio %p, biotail %p, len %u\n", rq->bio, rq->biotail, blk_rq_bytes(rq)); } EXPORT_SYMBOL(blk_dump_rq_flags); static void blk_delay_work(struct work_struct *work) { struct request_queue *q; q = container_of(work, struct request_queue, delay_work.work); spin_lock_irq(q->queue_lock); __blk_run_queue(q); spin_unlock_irq(q->queue_lock); } /** * blk_delay_queue - restart queueing after defined interval * @q: The &struct request_queue in question * @msecs: Delay in msecs * * Description: * Sometimes queueing needs to be postponed for a little while, to allow * resources to come back. This function will make sure that queueing is * restarted around the specified time. */ void blk_delay_queue(struct request_queue *q, unsigned long msecs) { lockdep_assert_held(q->queue_lock); WARN_ON_ONCE(q->mq_ops); if (likely(!blk_queue_dead(q))) queue_delayed_work(kblockd_workqueue, &q->delay_work, msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_delay_queue); /** * blk_start_queue_async - asynchronously restart a previously stopped queue * @q: The &struct request_queue in question * * Description: * blk_start_queue_async() will clear the stop flag on the queue, and * ensure that the request_fn for the queue is run from an async * context. **/ void blk_start_queue_async(struct request_queue *q) { lockdep_assert_held(q->queue_lock); WARN_ON_ONCE(q->mq_ops); queue_flag_clear(QUEUE_FLAG_STOPPED, q); blk_run_queue_async(q); } EXPORT_SYMBOL(blk_start_queue_async); /** * blk_start_queue - restart a previously stopped queue * @q: The &struct request_queue in question * * Description: * blk_start_queue() will clear the stop flag on the queue, and call * the request_fn for the queue if it was in a stopped state when * entered. Also see blk_stop_queue(). **/ void blk_start_queue(struct request_queue *q) { lockdep_assert_held(q->queue_lock); WARN_ON_ONCE(q->mq_ops); queue_flag_clear(QUEUE_FLAG_STOPPED, q); __blk_run_queue(q); } EXPORT_SYMBOL(blk_start_queue); /** * blk_stop_queue - stop a queue * @q: The &struct request_queue in question * * Description: * The Linux block layer assumes that a block driver will consume all * entries on the request queue when the request_fn strategy is called. * Often this will not happen, because of hardware limitations (queue * depth settings). If a device driver gets a 'queue full' response, * or if it simply chooses not to queue more I/O at one point, it can * call this function to prevent the request_fn from being called until * the driver has signalled it's ready to go again. This happens by calling * blk_start_queue() to restart queue operations. **/ void blk_stop_queue(struct request_queue *q) { lockdep_assert_held(q->queue_lock); WARN_ON_ONCE(q->mq_ops); cancel_delayed_work(&q->delay_work); queue_flag_set(QUEUE_FLAG_STOPPED, q); } EXPORT_SYMBOL(blk_stop_queue); /** * blk_sync_queue - cancel any pending callbacks on a queue * @q: the queue * * Description: * The block layer may perform asynchronous callback activity * on a queue, such as calling the unplug function after a timeout. * A block device may call blk_sync_queue to ensure that any * such activity is cancelled, thus allowing it to release resources * that the callbacks might use. The caller must already have made sure * that its ->make_request_fn will not re-add plugging prior to calling * this function. * * This function does not cancel any asynchronous activity arising * out of elevator or throttling code. That would require elevator_exit() * and blkcg_exit_queue() to be called with queue lock initialized. * */ void blk_sync_queue(struct request_queue *q) { del_timer_sync(&q->timeout); cancel_work_sync(&q->timeout_work); if (q->mq_ops) { struct blk_mq_hw_ctx *hctx; int i; queue_for_each_hw_ctx(q, hctx, i) cancel_delayed_work_sync(&hctx->run_work); } else { cancel_delayed_work_sync(&q->delay_work); } } EXPORT_SYMBOL(blk_sync_queue); /** * blk_set_pm_only - increment pm_only counter * @q: request queue pointer */ void blk_set_pm_only(struct request_queue *q) { atomic_inc(&q->pm_only); } EXPORT_SYMBOL_GPL(blk_set_pm_only); void blk_clear_pm_only(struct request_queue *q) { int pm_only; pm_only = atomic_dec_return(&q->pm_only); WARN_ON_ONCE(pm_only < 0); if (pm_only == 0) wake_up_all(&q->mq_freeze_wq); } EXPORT_SYMBOL_GPL(blk_clear_pm_only); /** * __blk_run_queue_uncond - run a queue whether or not it has been stopped * @q: The queue to run * * Description: * Invoke request handling on a queue if there are any pending requests. * May be used to restart request handling after a request has completed. * This variant runs the queue whether or not the queue has been * stopped. Must be called with the queue lock held and interrupts * disabled. See also @blk_run_queue. */ inline void __blk_run_queue_uncond(struct request_queue *q) { lockdep_assert_held(q->queue_lock); WARN_ON_ONCE(q->mq_ops); if (unlikely(blk_queue_dead(q))) return; /* * Some request_fn implementations, e.g. scsi_request_fn(), unlock * the queue lock internally. As a result multiple threads may be * running such a request function concurrently. Keep track of the * number of active request_fn invocations such that blk_drain_queue() * can wait until all these request_fn calls have finished. */ q->request_fn_active++; q->request_fn(q); q->request_fn_active--; } EXPORT_SYMBOL_GPL(__blk_run_queue_uncond); /** * __blk_run_queue - run a single device queue * @q: The queue to run * * Description: * See @blk_run_queue. */ void __blk_run_queue(struct request_queue *q) { lockdep_assert_held(q->queue_lock); WARN_ON_ONCE(q->mq_ops); if (unlikely(blk_queue_stopped(q))) return; __blk_run_queue_uncond(q); } EXPORT_SYMBOL(__blk_run_queue); /** * blk_run_queue_async - run a single device queue in workqueue context * @q: The queue to run * * Description: * Tells kblockd to perform the equivalent of @blk_run_queue on behalf * of us. * * Note: * Since it is not allowed to run q->delay_work after blk_cleanup_queue() * has canceled q->delay_work, callers must hold the queue lock to avoid * race conditions between blk_cleanup_queue() and blk_run_queue_async(). */ void blk_run_queue_async(struct request_queue *q) { lockdep_assert_held(q->queue_lock); WARN_ON_ONCE(q->mq_ops); if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q))) mod_delayed_work(kblockd_workqueue, &q->delay_work, 0); } EXPORT_SYMBOL(blk_run_queue_async); /** * blk_run_queue - run a single device queue * @q: The queue to run * * Description: * Invoke request handling on this queue, if it has pending work to do. * May be used to restart queueing when a request has completed. */ void blk_run_queue(struct request_queue *q) { unsigned long flags; WARN_ON_ONCE(q->mq_ops); spin_lock_irqsave(q->queue_lock, flags); __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL(blk_run_queue); void blk_put_queue(struct request_queue *q) { kobject_put(&q->kobj); } EXPORT_SYMBOL(blk_put_queue); /** * __blk_drain_queue - drain requests from request_queue * @q: queue to drain * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV * * Drain requests from @q. If @drain_all is set, all requests are drained. * If not, only ELVPRIV requests are drained. The caller is responsible * for ensuring that no new requests which need to be drained are queued. */ static void __blk_drain_queue(struct request_queue *q, bool drain_all) __releases(q->queue_lock) __acquires(q->queue_lock) { int i; lockdep_assert_held(q->queue_lock); WARN_ON_ONCE(q->mq_ops); while (true) { bool drain = false; /* * The caller might be trying to drain @q before its * elevator is initialized. */ if (q->elevator) elv_drain_elevator(q); blkcg_drain_queue(q); /* * This function might be called on a queue which failed * driver init after queue creation or is not yet fully * active yet. Some drivers (e.g. fd and loop) get unhappy * in such cases. Kick queue iff dispatch queue has * something on it and @q has request_fn set. */ if (!list_empty(&q->queue_head) && q->request_fn) __blk_run_queue(q); drain |= q->nr_rqs_elvpriv; drain |= q->request_fn_active; /* * Unfortunately, requests are queued at and tracked from * multiple places and there's no single counter which can * be drained. Check all the queues and counters. */ if (drain_all) { struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); drain |= !list_empty(&q->queue_head); for (i = 0; i < 2; i++) { drain |= q->nr_rqs[i]; drain |= q->in_flight[i]; if (fq) drain |= !list_empty(&fq->flush_queue[i]); } } if (!drain) break; spin_unlock_irq(q->queue_lock); msleep(10); spin_lock_irq(q->queue_lock); } /* * With queue marked dead, any woken up waiter will fail the * allocation path, so the wakeup chaining is lost and we're * left with hung waiters. We need to wake up those waiters. */ if (q->request_fn) { struct request_list *rl; blk_queue_for_each_rl(rl, q) for (i = 0; i < ARRAY_SIZE(rl->wait); i++) wake_up_all(&rl->wait[i]); } } void blk_drain_queue(struct request_queue *q) { spin_lock_irq(q->queue_lock); __blk_drain_queue(q, true); spin_unlock_irq(q->queue_lock); } /** * blk_queue_bypass_start - enter queue bypass mode * @q: queue of interest * * In bypass mode, only the dispatch FIFO queue of @q is used. This * function makes @q enter bypass mode and drains all requests which were * throttled or issued before. On return, it's guaranteed that no request * is being throttled or has ELVPRIV set and blk_queue_bypass() %true * inside queue or RCU read lock. */ void blk_queue_bypass_start(struct request_queue *q) { WARN_ON_ONCE(q->mq_ops); spin_lock_irq(q->queue_lock); q->bypass_depth++; queue_flag_set(QUEUE_FLAG_BYPASS, q); spin_unlock_irq(q->queue_lock); /* * Queues start drained. Skip actual draining till init is * complete. This avoids lenghty delays during queue init which * can happen many times during boot. */ if (blk_queue_init_done(q)) { spin_lock_irq(q->queue_lock); __blk_drain_queue(q, false); spin_unlock_irq(q->queue_lock); /* ensure blk_queue_bypass() is %true inside RCU read lock */ synchronize_rcu(); } } EXPORT_SYMBOL_GPL(blk_queue_bypass_start); /** * blk_queue_bypass_end - leave queue bypass mode * @q: queue of interest * * Leave bypass mode and restore the normal queueing behavior. * * Note: although blk_queue_bypass_start() is only called for blk-sq queues, * this function is called for both blk-sq and blk-mq queues. */ void blk_queue_bypass_end(struct request_queue *q) { spin_lock_irq(q->queue_lock); if (!--q->bypass_depth) queue_flag_clear(QUEUE_FLAG_BYPASS, q); WARN_ON_ONCE(q->bypass_depth < 0); spin_unlock_irq(q->queue_lock); } EXPORT_SYMBOL_GPL(blk_queue_bypass_end); void blk_set_queue_dying(struct request_queue *q) { blk_queue_flag_set(QUEUE_FLAG_DYING, q); /* * When queue DYING flag is set, we need to block new req * entering queue, so we call blk_freeze_queue_start() to * prevent I/O from crossing blk_queue_enter(). */ blk_freeze_queue_start(q); if (q->mq_ops) blk_mq_wake_waiters(q); else { struct request_list *rl; spin_lock_irq(q->queue_lock); blk_queue_for_each_rl(rl, q) { if (rl->rq_pool) { wake_up_all(&rl->wait[BLK_RW_SYNC]); wake_up_all(&rl->wait[BLK_RW_ASYNC]); } } spin_unlock_irq(q->queue_lock); } /* Make blk_queue_enter() reexamine the DYING flag. */ wake_up_all(&q->mq_freeze_wq); } EXPORT_SYMBOL_GPL(blk_set_queue_dying); /* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */ void blk_exit_queue(struct request_queue *q) { /* * Since the I/O scheduler exit code may access cgroup information, * perform I/O scheduler exit before disassociating from the block * cgroup controller. */ if (q->elevator) { ioc_clear_queue(q); elevator_exit(q, q->elevator); q->elevator = NULL; } /* * Remove all references to @q from the block cgroup controller before * restoring @q->queue_lock to avoid that restoring this pointer causes * e.g. blkcg_print_blkgs() to crash. */ blkcg_exit_queue(q); /* * Since the cgroup code may dereference the @q->backing_dev_info * pointer, only decrease its reference count after having removed the * association with the block cgroup controller. */ bdi_put(q->backing_dev_info); } /** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown * * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and * put it. All future requests will be failed immediately with -ENODEV. */ void blk_cleanup_queue(struct request_queue *q) { spinlock_t *lock = q->queue_lock; /* mark @q DYING, no new request or merges will be allowed afterwards */ mutex_lock(&q->sysfs_lock); blk_set_queue_dying(q); spin_lock_irq(lock); /* * A dying queue is permanently in bypass mode till released. Note * that, unlike blk_queue_bypass_start(), we aren't performing * synchronize_rcu() after entering bypass mode to avoid the delay * as some drivers create and destroy a lot of queues while * probing. This is still safe because blk_release_queue() will be * called only after the queue refcnt drops to zero and nothing, * RCU or not, would be traversing the queue by then. */ q->bypass_depth++; queue_flag_set(QUEUE_FLAG_BYPASS, q); queue_flag_set(QUEUE_FLAG_NOMERGES, q); queue_flag_set(QUEUE_FLAG_NOXMERGES, q); queue_flag_set(QUEUE_FLAG_DYING, q); spin_unlock_irq(lock); mutex_unlock(&q->sysfs_lock); /* * Drain all requests queued before DYING marking. Set DEAD flag to * prevent that q->request_fn() gets invoked after draining finished. */ blk_freeze_queue(q); rq_qos_exit(q); spin_lock_irq(lock); queue_flag_set(QUEUE_FLAG_DEAD, q); spin_unlock_irq(lock); /* * make sure all in-progress dispatch are completed because * blk_freeze_queue() can only complete all requests, and * dispatch may still be in-progress since we dispatch requests * from more than one contexts. * * We rely on driver to deal with the race in case that queue * initialization isn't done. */ if (q->mq_ops && blk_queue_init_done(q)) blk_mq_quiesce_queue(q); /* for synchronous bio-based driver finish in-flight integrity i/o */ blk_flush_integrity(); /* @q won't process any more request, flush async actions */ del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer); blk_sync_queue(q); /* * I/O scheduler exit is only safe after the sysfs scheduler attribute * has been removed. */ WARN_ON_ONCE(q->kobj.state_in_sysfs); blk_exit_queue(q); if (q->mq_ops) blk_mq_exit_queue(q); percpu_ref_exit(&q->q_usage_counter); spin_lock_irq(lock); if (q->queue_lock != &q->__queue_lock) q->queue_lock = &q->__queue_lock; spin_unlock_irq(lock); /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); /* Allocate memory local to the request queue */ static void *alloc_request_simple(gfp_t gfp_mask, void *data) { struct request_queue *q = data; return kmem_cache_alloc_node(request_cachep, gfp_mask, q->node); } static void free_request_simple(void *element, void *data) { kmem_cache_free(request_cachep, element); } static void *alloc_request_size(gfp_t gfp_mask, void *data) { struct request_queue *q = data; struct request *rq; rq = kmalloc_node(sizeof(struct request) + q->cmd_size, gfp_mask, q->node); if (rq && q->init_rq_fn && q->init_rq_fn(q, rq, gfp_mask) < 0) { kfree(rq); rq = NULL; } return rq; } static void free_request_size(void *element, void *data) { struct request_queue *q = data; if (q->exit_rq_fn) q->exit_rq_fn(q, element); kfree(element); } int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask) { if (unlikely(rl->rq_pool) || q->mq_ops) return 0; rl->q = q; rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); if (q->cmd_size) { rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, alloc_request_size, free_request_size, q, gfp_mask, q->node); } else { rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, alloc_request_simple, free_request_simple, q, gfp_mask, q->node); } if (!rl->rq_pool) return -ENOMEM; if (rl != &q->root_rl) WARN_ON_ONCE(!blk_get_queue(q)); return 0; } void blk_exit_rl(struct request_queue *q, struct request_list *rl) { if (rl->rq_pool) { mempool_destroy(rl->rq_pool); if (rl != &q->root_rl) blk_put_queue(q); } } struct request_queue *blk_alloc_queue(gfp_t gfp_mask) { return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE, NULL); } EXPORT_SYMBOL(blk_alloc_queue); /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT */ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { const bool pm = flags & BLK_MQ_REQ_PREEMPT; while (true) { bool success = false; rcu_read_lock(); if (percpu_ref_tryget_live(&q->q_usage_counter)) { /* * The code that increments the pm_only counter is * responsible for ensuring that that counter is * globally visible before the queue is unfrozen. */ if (pm || !blk_queue_pm_only(q)) { success = true; } else { percpu_ref_put(&q->q_usage_counter); } } rcu_read_unlock(); if (success) return 0; if (flags & BLK_MQ_REQ_NOWAIT) return -EBUSY; /* * read pair of barrier in blk_freeze_queue_start(), * we need to order reading __PERCPU_REF_DEAD flag of * .q_usage_counter and reading .mq_freeze_depth or * queue dying flag, otherwise the following wait may * never return if the two reads are reordered. */ smp_rmb(); wait_event(q->mq_freeze_wq, (atomic_read(&q->mq_freeze_depth) == 0 && (pm || !blk_queue_pm_only(q))) || blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; } } void blk_queue_exit(struct request_queue *q) { percpu_ref_put(&q->q_usage_counter); } static void blk_queue_usage_counter_release(struct percpu_ref *ref) { struct request_queue *q = container_of(ref, struct request_queue, q_usage_counter); wake_up_all(&q->mq_freeze_wq); } static void blk_rq_timed_out_timer(struct timer_list *t) { struct request_queue *q = from_timer(q, t, timeout); kblockd_schedule_work(&q->timeout_work); } static void blk_timeout_work_dummy(struct work_struct *work) { } /** * blk_alloc_queue_node - allocate a request queue * @gfp_mask: memory allocation flags * @node_id: NUMA node to allocate memory from * @lock: For legacy queues, pointer to a spinlock that will be used to e.g. * serialize calls to the legacy .request_fn() callback. Ignored for * blk-mq request queues. * * Note: pass the queue lock as the third argument to this function instead of * setting the queue lock pointer explicitly to avoid triggering a sporadic * crash in the blkcg code. This function namely calls blkcg_init_queue() and * the queue lock pointer must be set before blkcg_init_queue() is called. */ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, spinlock_t *lock) { struct request_queue *q; int ret; q = kmem_cache_alloc_node(blk_requestq_cachep, gfp_mask | __GFP_ZERO, node_id); if (!q) return NULL; INIT_LIST_HEAD(&q->queue_head); q->last_merge = NULL; q->end_sector = 0; q->boundary_rq = NULL; q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); if (q->id < 0) goto fail_q; ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (ret) goto fail_id; q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id); if (!q->backing_dev_info) goto fail_split; q->stats = blk_alloc_queue_stats(); if (!q->stats) goto fail_stats; q->backing_dev_info->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; q->backing_dev_info->io_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK; q->backing_dev_info->name = "block"; q->node = node_id; timer_setup(&q->backing_dev_info->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, blk_timeout_work_dummy); INIT_LIST_HEAD(&q->timeout_list); INIT_LIST_HEAD(&q->icq_list); #ifdef CONFIG_BLK_CGROUP INIT_LIST_HEAD(&q->blkg_list); #endif INIT_DELAYED_WORK(&q->delay_work, blk_delay_work); kobject_init(&q->kobj, &blk_queue_ktype); #ifdef CONFIG_BLK_DEV_IO_TRACE mutex_init(&q->blk_trace_mutex); #endif mutex_init(&q->sysfs_lock); mutex_init(&q->sysfs_dir_lock); spin_lock_init(&q->__queue_lock); if (!q->mq_ops) q->queue_lock = lock ? : &q->__queue_lock; /* * A queue starts its life with bypass turned on to avoid * unnecessary bypass on/off overhead and nasty surprises during * init. The initial bypass will be finished when the queue is * registered by blk_register_queue(). */ q->bypass_depth = 1; queue_flag_set_unlocked(QUEUE_FLAG_BYPASS, q); init_waitqueue_head(&q->mq_freeze_wq); /* * Init percpu_ref in atomic mode so that it's faster to shutdown. * See blk_register_queue() for details. */ if (percpu_ref_init(&q->q_usage_counter, blk_queue_usage_counter_release, PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) goto fail_bdi; if (blkcg_init_queue(q)) goto fail_ref; return q; fail_ref: percpu_ref_exit(&q->q_usage_counter); fail_bdi: blk_free_queue_stats(q->stats); fail_stats: bdi_put(q->backing_dev_info); fail_split: bioset_exit(&q->bio_split); fail_id: ida_simple_remove(&blk_queue_ida, q->id); fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; } EXPORT_SYMBOL(blk_alloc_queue_node); /** * blk_init_queue - prepare a request queue for use with a block device * @rfn: The function to be called to process requests that have been * placed on the queue. * @lock: Request queue spin lock * * Description: * If a block device wishes to use the standard request handling procedures, * which sorts requests and coalesces adjacent requests, then it must * call blk_init_queue(). The function @rfn will be called when there * are requests on the queue that need to be processed. If the device * supports plugging, then @rfn may not be called immediately when requests * are available on the queue, but may be called at some time later instead. * Plugged queues are generally unplugged when a buffer belonging to one * of the requests on the queue is needed, or due to memory pressure. * * @rfn is not required, or even expected, to remove all requests off the * queue, but only as many as it can handle at a time. If it does leave * requests on the queue, it is responsible for arranging that the requests * get dealt with eventually. * * The queue spin lock must be held while manipulating the requests on the * request queue; this lock will be taken also from interrupt context, so irq * disabling is needed for it. * * Function returns a pointer to the initialized request queue, or %NULL if * it didn't succeed. * * Note: * blk_init_queue() must be paired with a blk_cleanup_queue() call * when the block device is deactivated (such as at module unload). **/ struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) { return blk_init_queue_node(rfn, lock, NUMA_NO_NODE); } EXPORT_SYMBOL(blk_init_queue); struct request_queue * blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) { struct request_queue *q; q = blk_alloc_queue_node(GFP_KERNEL, node_id, lock); if (!q) return NULL; q->request_fn = rfn; if (blk_init_allocated_queue(q) < 0) { blk_cleanup_queue(q); return NULL; } return q; } EXPORT_SYMBOL(blk_init_queue_node); static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio); int blk_init_allocated_queue(struct request_queue *q) { WARN_ON_ONCE(q->mq_ops); q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size, GFP_KERNEL); if (!q->fq) return -ENOMEM; if (q->init_rq_fn && q->init_rq_fn(q, q->fq->flush_rq, GFP_KERNEL)) goto out_free_flush_queue; if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) goto out_exit_flush_rq; INIT_WORK(&q->timeout_work, blk_timeout_work); q->queue_flags |= QUEUE_FLAG_DEFAULT; /* * This also sets hw/phys segments, boundary and size */ blk_queue_make_request(q, blk_queue_bio); q->sg_reserved_size = INT_MAX; if (elevator_init(q)) goto out_exit_flush_rq; return 0; out_exit_flush_rq: if (q->exit_rq_fn) q->exit_rq_fn(q, q->fq->flush_rq); out_free_flush_queue: blk_free_flush_queue(q->fq); q->fq = NULL; return -ENOMEM; } EXPORT_SYMBOL(blk_init_allocated_queue); bool blk_get_queue(struct request_queue *q) { if (likely(!blk_queue_dying(q))) { __blk_get_queue(q); return true; } return false; } EXPORT_SYMBOL(blk_get_queue); static inline void blk_free_request(struct request_list *rl, struct request *rq) { if (rq->rq_flags & RQF_ELVPRIV) { elv_put_request(rl->q, rq); if (rq->elv.icq) put_io_context(rq->elv.icq->ioc); } mempool_free(rq, rl->rq_pool); } /* * ioc_batching returns true if the ioc is a valid batching request and * should be given priority access to a request. */ static inline int ioc_batching(struct request_queue *q, struct io_context *ioc) { if (!ioc) return 0; /* * Make sure the process is able to allocate at least 1 request * even if the batch times out, otherwise we could theoretically * lose wakeups. */ return ioc->nr_batch_requests == q->nr_batching || (ioc->nr_batch_requests > 0 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); } /* * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This * will cause the process to be a "batcher" on all queues in the system. This * is the behaviour we want though - once it gets a wakeup it should be given * a nice run. */ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) { if (!ioc || ioc_batching(q, ioc)) return; ioc->nr_batch_requests = q->nr_batching; ioc->last_waited = jiffies; } static void __freed_request(struct request_list *rl, int sync) { struct request_queue *q = rl->q; if (rl->count[sync] < queue_congestion_off_threshold(q)) blk_clear_congested(rl, sync); if (rl->count[sync] + 1 <= q->nr_requests) { if (waitqueue_active(&rl->wait[sync])) wake_up(&rl->wait[sync]); blk_clear_rl_full(rl, sync); } } /* * A request has just been released. Account for it, update the full and * congestion status, wake up any waiters. Called under q->queue_lock. */ static void freed_request(struct request_list *rl, bool sync, req_flags_t rq_flags) { struct request_queue *q = rl->q; q->nr_rqs[sync]--; rl->count[sync]--; if (rq_flags & RQF_ELVPRIV) q->nr_rqs_elvpriv--; __freed_request(rl, sync); if (unlikely(rl->starved[sync ^ 1])) __freed_request(rl, sync ^ 1); } int blk_update_nr_requests(struct request_queue *q, unsigned int nr) { struct request_list *rl; int on_thresh, off_thresh; WARN_ON_ONCE(q->mq_ops); spin_lock_irq(q->queue_lock); q->nr_requests = nr; blk_queue_congestion_threshold(q); on_thresh = queue_congestion_on_threshold(q); off_thresh = queue_congestion_off_threshold(q); blk_queue_for_each_rl(rl, q) { if (rl->count[BLK_RW_SYNC] >= on_thresh) blk_set_congested(rl, BLK_RW_SYNC); else if (rl->count[BLK_RW_SYNC] < off_thresh) blk_clear_congested(rl, BLK_RW_SYNC); if (rl->count[BLK_RW_ASYNC] >= on_thresh) blk_set_congested(rl, BLK_RW_ASYNC); else if (rl->count[BLK_RW_ASYNC] < off_thresh) blk_clear_congested(rl, BLK_RW_ASYNC); if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { blk_set_rl_full(rl, BLK_RW_SYNC); } else { blk_clear_rl_full(rl, BLK_RW_SYNC); wake_up(&rl->wait[BLK_RW_SYNC]); } if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { blk_set_rl_full(rl, BLK_RW_ASYNC); } else { blk_clear_rl_full(rl, BLK_RW_ASYNC); wake_up(&rl->wait[BLK_RW_ASYNC]); } } spin_unlock_irq(q->queue_lock); return 0; } /** * __get_request - get a free request * @rl: request list to allocate from * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) * @flags: BLQ_MQ_REQ_* flags * @gfp_mask: allocator flags * * Get a free request from @q. This function may fail under memory * pressure or if @q is dead. * * Must be called with @q->queue_lock held and, * Returns ERR_PTR on failure, with @q->queue_lock held. * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *__get_request(struct request_list *rl, unsigned int op, struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp_mask) { struct request_queue *q = rl->q; struct request *rq; struct elevator_type *et = q->elevator->type; struct io_context *ioc = rq_ioc(bio); struct io_cq *icq = NULL; const bool is_sync = op_is_sync(op); int may_queue; req_flags_t rq_flags = RQF_ALLOCED; lockdep_assert_held(q->queue_lock); if (unlikely(blk_queue_dying(q))) return ERR_PTR(-ENODEV); may_queue = elv_may_queue(q, op); if (may_queue == ELV_MQUEUE_NO) goto rq_starved; if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { if (rl->count[is_sync]+1 >= q->nr_requests) { /* * The queue will fill after this allocation, so set * it as full, and mark this process as "batching". * This process will be allowed to complete a batch of * requests, others will be blocked. */ if (!blk_rl_full(rl, is_sync)) { ioc_set_batching(q, ioc); blk_set_rl_full(rl, is_sync); } else { if (may_queue != ELV_MQUEUE_MUST && !ioc_batching(q, ioc)) { /* * The queue is full and the allocating * process is not a "batcher", and not * exempted by the IO scheduler */ return ERR_PTR(-ENOMEM); } } } blk_set_congested(rl, is_sync); } /* * Only allow batching queuers to allocate up to 50% over the defined * limit of requests, otherwise we could have thousands of requests * allocated with any setting of ->nr_requests */ if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) return ERR_PTR(-ENOMEM); q->nr_rqs[is_sync]++; rl->count[is_sync]++; rl->starved[is_sync] = 0; /* * Decide whether the new request will be managed by elevator. If * so, mark @rq_flags and increment elvpriv. Non-zero elvpriv will * prevent the current elevator from being destroyed until the new * request is freed. This guarantees icq's won't be destroyed and * makes creating new ones safe. * * Flush requests do not use the elevator so skip initialization. * This allows a request to share the flush and elevator data. * * Also, lookup icq while holding queue_lock. If it doesn't exist, * it will be created after releasing queue_lock. */ if (!op_is_flush(op) && !blk_queue_bypass(q)) { rq_flags |= RQF_ELVPRIV; q->nr_rqs_elvpriv++; if (et->icq_cache && ioc) icq = ioc_lookup_icq(ioc, q); } if (blk_queue_io_stat(q)) rq_flags |= RQF_IO_STAT; spin_unlock_irq(q->queue_lock); /* allocate and init request */ rq = mempool_alloc(rl->rq_pool, gfp_mask); if (!rq) goto fail_alloc; blk_rq_init(q, rq); blk_rq_set_rl(rq, rl); rq->cmd_flags = op; rq->rq_flags = rq_flags; if (flags & BLK_MQ_REQ_PREEMPT) rq->rq_flags |= RQF_PREEMPT; /* init elvpriv */ if (rq_flags & RQF_ELVPRIV) { if (unlikely(et->icq_cache && !icq)) { if (ioc) icq = ioc_create_icq(ioc, q, gfp_mask); if (!icq) goto fail_elvpriv; } rq->elv.icq = icq; if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) goto fail_elvpriv; /* @rq->elv.icq holds io_context until @rq is freed */ if (icq) get_io_context(icq->ioc); } out: /* * ioc may be NULL here, and ioc_batching will be false. That's * OK, if the queue is under the request limit then requests need * not count toward the nr_batch_requests limit. There will always * be some limit enforced by BLK_BATCH_TIME. */ if (ioc_batching(q, ioc)) ioc->nr_batch_requests--; trace_block_getrq(q, bio, op); return rq; fail_elvpriv: /* * elvpriv init failed. ioc, icq and elvpriv aren't mempool backed * and may fail indefinitely under memory pressure and thus * shouldn't stall IO. Treat this request as !elvpriv. This will * disturb iosched and blkcg but weird is bettern than dead. */ printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n", __func__, dev_name(q->backing_dev_info->dev)); rq->rq_flags &= ~RQF_ELVPRIV; rq->elv.icq = NULL; spin_lock_irq(q->queue_lock); q->nr_rqs_elvpriv--; spin_unlock_irq(q->queue_lock); goto out; fail_alloc: /* * Allocation failed presumably due to memory. Undo anything we * might have messed up. * * Allocating task should really be put onto the front of the wait * queue, but this is pretty rare. */ spin_lock_irq(q->queue_lock); freed_request(rl, is_sync, rq_flags); /* * in the very unlikely event that allocation failed and no * requests for this direction was pending, mark us starved so that * freeing of a request in the other direction will notice * us. another possible fix would be to split the rq mempool into * READ and WRITE */ rq_starved: if (unlikely(rl->count[is_sync] == 0)) rl->starved[is_sync] = 1; return ERR_PTR(-ENOMEM); } /** * get_request - get a free request * @q: request_queue to allocate request from * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) * @flags: BLK_MQ_REQ_* flags. * @gfp: allocator flags * * Get a free request from @q. If %BLK_MQ_REQ_NOWAIT is set in @flags, * this function keeps retrying under memory pressure and fails iff @q is dead. * * Must be called with @q->queue_lock held and, * Returns ERR_PTR on failure, with @q->queue_lock held. * Returns request pointer on success, with @q->queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, unsigned int op, struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp) { const bool is_sync = op_is_sync(op); DEFINE_WAIT(wait); struct request_list *rl; struct request *rq; lockdep_assert_held(q->queue_lock); WARN_ON_ONCE(q->mq_ops); rl = blk_get_rl(q, bio); /* transferred to @rq on success */ retry: rq = __get_request(rl, op, bio, flags, gfp); if (!IS_ERR(rq)) return rq; if (op & REQ_NOWAIT) { blk_put_rl(rl); return ERR_PTR(-EAGAIN); } if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); return rq; } /* wait on @rl and retry */ prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, TASK_UNINTERRUPTIBLE); trace_block_sleeprq(q, bio, op); spin_unlock_irq(q->queue_lock); io_schedule(); /* * After sleeping, we become a "batching" process and will be able * to allocate at least one request, and up to a big batch of them * for a small period time. See ioc_batching, ioc_set_batching */ ioc_set_batching(q, current->io_context); spin_lock_irq(q->queue_lock); finish_wait(&rl->wait[is_sync], &wait); goto retry; } /* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */ static struct request *blk_old_get_request(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags) { struct request *rq; gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : GFP_NOIO; int ret = 0; WARN_ON_ONCE(q->mq_ops); /* create ioc upfront */ create_io_context(gfp_mask, q->node); ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); spin_lock_irq(q->queue_lock); rq = get_request(q, op, NULL, flags, gfp_mask); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); blk_queue_exit(q); return rq; } /* q->queue_lock is unlocked at this point */ rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; } /** * blk_get_request - allocate a request * @q: request queue to allocate a request for * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC. * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT. */ struct request *blk_get_request(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags) { struct request *req; WARN_ON_ONCE(op & REQ_NOWAIT); WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT)); if (q->mq_ops) { req = blk_mq_alloc_request(q, op, flags); if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) q->mq_ops->initialize_rq_fn(req); } else { req = blk_old_get_request(q, op, flags); if (!IS_ERR(req) && q->initialize_rq_fn) q->initialize_rq_fn(req); } return req; } EXPORT_SYMBOL(blk_get_request); /** * blk_requeue_request - put a request back on queue * @q: request queue where request should be inserted * @rq: request to be inserted * * Description: * Drivers often keep queueing requests until the hardware cannot accept * more, when that condition happens we need to put the request back * on the queue. Must be called with queue lock held. */ void blk_requeue_request(struct request_queue *q, struct request *rq) { lockdep_assert_held(q->queue_lock); WARN_ON_ONCE(q->mq_ops); blk_delete_timer(rq); blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); rq_qos_requeue(q, rq); if (rq->rq_flags & RQF_QUEUED) blk_queue_end_tag(q, rq); BUG_ON(blk_queued_rq(rq)); elv_requeue_request(q, rq); } EXPORT_SYMBOL(blk_requeue_request); static void add_acct_request(struct request_queue *q, struct request *rq, int where) { blk_account_io_start(rq, true); __elv_add_request(q, rq, where); } static void part_round_stats_single(struct request_queue *q, int cpu, struct hd_struct *part, unsigned long now, unsigned int inflight) { if (inflight) { __part_stat_add(cpu, part, time_in_queue, inflight * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); } part->stamp = now; } /** * part_round_stats() - Round off the performance stats on a struct disk_stats. * @q: target block queue * @cpu: cpu number for stats access * @part: target partition * * The average IO queue length and utilisation statistics are maintained * by observing the current state of the queue length and the amount of * time it has been in this state for. * * Normally, that accounting is done on IO completion, but that can result * in more than a second's worth of IO being accounted for within any one * second, leading to >100% utilisation. To deal with that, we call this * function to do a round-off before returning the results when reading * /proc/diskstats. This accounts immediately for all queue usage up to * the current jiffies and restarts the counters again. */ void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part) { struct hd_struct *part2 = NULL; unsigned long now = jiffies; unsigned int inflight[2]; int stats = 0; if (part->stamp != now) stats |= 1; if (part->partno) { part2 = &part_to_disk(part)->part0; if (part2->stamp != now) stats |= 2; } if (!stats) return; part_in_flight(q, part, inflight); if (stats & 2) part_round_stats_single(q, cpu, part2, now, inflight[1]); if (stats & 1) part_round_stats_single(q, cpu, part, now, inflight[0]); } EXPORT_SYMBOL_GPL(part_round_stats); #ifdef CONFIG_PM static void blk_pm_put_request(struct request *rq) { if (rq->q->dev && !(rq->rq_flags & RQF_PM) && !--rq->q->nr_pending) pm_runtime_mark_last_busy(rq->q->dev); } #else static inline void blk_pm_put_request(struct request *rq) {} #endif void __blk_put_request(struct request_queue *q, struct request *req) { req_flags_t rq_flags = req->rq_flags; if (unlikely(!q)) return; if (q->mq_ops) { blk_mq_free_request(req); return; } lockdep_assert_held(q->queue_lock); blk_req_zone_write_unlock(req); blk_pm_put_request(req); elv_completed_request(q, req); /* this is a bio leak */ WARN_ON(req->bio != NULL); rq_qos_done(q, req); /* * Request may not have originated from ll_rw_blk. if not, * it didn't come out of our reserved rq pools */ if (rq_flags & RQF_ALLOCED) { struct request_list *rl = blk_rq_rl(req); bool sync = op_is_sync(req->cmd_flags); BUG_ON(!list_empty(&req->queuelist)); BUG_ON(ELV_ON_HASH(req)); blk_free_request(rl, req); freed_request(rl, sync, rq_flags); blk_put_rl(rl); blk_queue_exit(q); } } EXPORT_SYMBOL_GPL(__blk_put_request); void blk_put_request(struct request *req) { struct request_queue *q = req->q; if (q->mq_ops) blk_mq_free_request(req); else { unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); __blk_put_request(q, req); spin_unlock_irqrestore(q->queue_lock, flags); } } EXPORT_SYMBOL(blk_put_request); bool bio_attempt_back_merge(struct request_queue *q, struct request *req, struct bio *bio) { const int ff = bio->bi_opf & REQ_FAILFAST_MASK; if (!ll_back_merge_fn(q, req, bio)) return false; trace_block_bio_backmerge(q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); blk_account_io_start(req, false); return true; } bool bio_attempt_front_merge(struct request_queue *q, struct request *req, struct bio *bio) { const int ff = bio->bi_opf & REQ_FAILFAST_MASK; if (!ll_front_merge_fn(q, req, bio)) return false; trace_block_bio_frontmerge(q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); bio->bi_next = req->bio; req->bio = bio; req->__sector = bio->bi_iter.bi_sector; req->__data_len += bio->bi_iter.bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); blk_account_io_start(req, false); return true; } bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, struct bio *bio) { unsigned short segments = blk_rq_nr_discard_segments(req); if (segments >= queue_max_discard_segments(q)) goto no_merge; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req, blk_rq_pos(req))) goto no_merge; req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); req->nr_phys_segments = segments + 1; blk_account_io_start(req, false); return true; no_merge: req_set_nomerge(q, req); return false; } /** * blk_attempt_plug_merge - try to merge with %current's plugged list * @q: request_queue new bio is being queued at * @bio: new bio being queued * @request_count: out parameter for number of traversed plugged requests * @same_queue_rq: pointer to &struct request that gets filled in when * another request associated with @q is found on the plug list * (optional, may be %NULL) * * Determine whether @bio being queued on @q can be merged with a request * on %current's plugged list. Returns %true if merge was successful, * otherwise %false. * * Plugging coalesces IOs from the same issuer for the same purpose without * going through @q->queue_lock. As such it's more of an issuing mechanism * than scheduling, and the request, while may have elvpriv data, is not * added on the elevator at this point. In addition, we don't have * reliable access to the elevator outside queue lock. Only check basic * merging parameters without querying the elevator. * * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int *request_count, struct request **same_queue_rq) { struct blk_plug *plug; struct request *rq; struct list_head *plug_list; plug = current->plug; if (!plug) return false; *request_count = 0; if (q->mq_ops) plug_list = &plug->mq_list; else plug_list = &plug->list; list_for_each_entry_reverse(rq, plug_list, queuelist) { bool merged = false; if (rq->q == q) { (*request_count)++; /* * Only blk-mq multiple hardware queues case checks the * rq in the same queue, there should be only one such * rq in a queue **/ if (same_queue_rq) *same_queue_rq = rq; } if (rq->q != q || !blk_rq_merge_ok(rq, bio)) continue; switch (blk_try_merge(rq, bio)) { case ELEVATOR_BACK_MERGE: merged = bio_attempt_back_merge(q, rq, bio); break; case ELEVATOR_FRONT_MERGE: merged = bio_attempt_front_merge(q, rq, bio); break; case ELEVATOR_DISCARD_MERGE: merged = bio_attempt_discard_merge(q, rq, bio); break; default: break; } if (merged) return true; } return false; } unsigned int blk_plug_queued_count(struct request_queue *q) { struct blk_plug *plug; struct request *rq; struct list_head *plug_list; unsigned int ret = 0; plug = current->plug; if (!plug) goto out; if (q->mq_ops) plug_list = &plug->mq_list; else plug_list = &plug->list; list_for_each_entry(rq, plug_list, queuelist) { if (rq->q == q) ret++; } out: return ret; } void blk_init_request_from_bio(struct request *req, struct bio *bio) { struct io_context *ioc = rq_ioc(bio); if (bio->bi_opf & REQ_RAHEAD) req->cmd_flags |= REQ_FAILFAST_MASK; req->__sector = bio->bi_iter.bi_sector; if (ioprio_valid(bio_prio(bio))) req->ioprio = bio_prio(bio); else if (ioc) req->ioprio = ioc->ioprio; else req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); req->write_hint = bio->bi_write_hint; blk_rq_bio_prep(req->q, req, bio); } EXPORT_SYMBOL_GPL(blk_init_request_from_bio); static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) { struct blk_plug *plug; int where = ELEVATOR_INSERT_SORT; struct request *req, *free; unsigned int request_count = 0; /* * low level driver can indicate that it wants pages above a * certain limit bounced to low memory (ie for highmem, or even * ISA dma in theory) */ blk_queue_bounce(q, &bio); blk_queue_split(q, &bio); if (!bio_integrity_prep(bio)) return BLK_QC_T_NONE; if (op_is_flush(bio->bi_opf)) { spin_lock_irq(q->queue_lock); where = ELEVATOR_INSERT_FLUSH; goto get_rq; } /* * Check if we can merge with the plugged list before grabbing * any locks. */ if (!blk_queue_nomerges(q)) { if (blk_attempt_plug_merge(q, bio, &request_count, NULL)) return BLK_QC_T_NONE; } else request_count = blk_plug_queued_count(q); spin_lock_irq(q->queue_lock); switch (elv_merge(q, &req, bio)) { case ELEVATOR_BACK_MERGE: if (!bio_attempt_back_merge(q, req, bio)) break; elv_bio_merged(q, req, bio); free = attempt_back_merge(q, req); if (free) __blk_put_request(q, free); else elv_merged_request(q, req, ELEVATOR_BACK_MERGE); goto out_unlock; case ELEVATOR_FRONT_MERGE: if (!bio_attempt_front_merge(q, req, bio)) break; elv_bio_merged(q, req, bio); free = attempt_front_merge(q, req); if (free) __blk_put_request(q, free); else elv_merged_request(q, req, ELEVATOR_FRONT_MERGE); goto out_unlock; default: break; } get_rq: rq_qos_throttle(q, bio, q->queue_lock); /* * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. */ blk_queue_enter_live(q); req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO); if (IS_ERR(req)) { blk_queue_exit(q); rq_qos_cleanup(q, bio); if (PTR_ERR(req) == -ENOMEM) bio->bi_status = BLK_STS_RESOURCE; else bio->bi_status = BLK_STS_IOERR; bio_endio(bio); goto out_unlock; } rq_qos_track(q, req, bio); /* * After dropping the lock and possibly sleeping here, our request * may now be mergeable after it had proven unmergeable (above). * We don't worry about that case for efficiency. It won't happen * often, and the elevators are able to handle it. */ blk_init_request_from_bio(req, bio); if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) req->cpu = raw_smp_processor_id(); plug = current->plug; if (plug) { /* * If this is the first request added after a plug, fire * of a plug trace. * * @request_count may become stale because of schedule * out, so check plug list again. */ if (!request_count || list_empty(&plug->list)) trace_block_plug(q); else { struct request *last = list_entry_rq(plug->list.prev); if (request_count >= BLK_MAX_REQUEST_COUNT || blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE) { blk_flush_plug_list(plug, false); trace_block_plug(q); } } list_add_tail(&req->queuelist, &plug->list); blk_account_io_start(req, true); } else { spin_lock_irq(q->queue_lock); add_acct_request(q, req, where); __blk_run_queue(q); out_unlock: spin_unlock_irq(q->queue_lock); } return BLK_QC_T_NONE; } static void handle_bad_sector(struct bio *bio, sector_t maxsector) { char b[BDEVNAME_SIZE]; printk(KERN_INFO "attempt to access beyond end of device\n"); printk(KERN_INFO "%s: rw=%d, want=%Lu, limit=%Lu\n", bio_devname(bio, b), bio->bi_opf, (unsigned long long)bio_end_sector(bio), (long long)maxsector); } #ifdef CONFIG_FAIL_MAKE_REQUEST static DECLARE_FAULT_ATTR(fail_make_request); static int __init setup_fail_make_request(char *str) { return setup_fault_attr(&fail_make_request, str); } __setup("fail_make_request=", setup_fail_make_request); static bool should_fail_request(struct hd_struct *part, unsigned int bytes) { return part->make_it_fail && should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) { struct dentry *dir = fault_create_debugfs_attr("fail_make_request", NULL, &fail_make_request); return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_make_request_debugfs); #else /* CONFIG_FAIL_MAKE_REQUEST */ static inline bool should_fail_request(struct hd_struct *part, unsigned int bytes) { return false; } #endif /* CONFIG_FAIL_MAKE_REQUEST */ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) { const int op = bio_op(bio); if (part->policy && op_is_write(op)) { char b[BDEVNAME_SIZE]; if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) return false; WARN_ONCE(1, "generic_make_request: Trying to write " "to read-only block-device %s (partno %d)\n", bio_devname(bio, b), part->partno); /* Older lvm-tools actually trigger this */ return false; } return false; } static noinline int should_fail_bio(struct bio *bio) { if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) return -EIO; return 0; } ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO); /* * Check whether this bio extends beyond the end of the device or partition. * This may well happen - the kernel calls bread() without checking the size of * the device, e.g., when mounting a file system. */ static inline int bio_check_eod(struct bio *bio, sector_t maxsector) { unsigned int nr_sectors = bio_sectors(bio); if (nr_sectors && maxsector && (nr_sectors > maxsector || bio->bi_iter.bi_sector > maxsector - nr_sectors)) { handle_bad_sector(bio, maxsector); return -EIO; } return 0; } /* * Remap block n of partition p to block n+start(p) of the disk. */ static inline int blk_partition_remap(struct bio *bio) { struct hd_struct *p; int ret = -EIO; rcu_read_lock(); p = __disk_get_part(bio->bi_disk, bio->bi_partno); if (unlikely(!p)) goto out; if (unlikely(should_fail_request(p, bio->bi_iter.bi_size))) goto out; if (unlikely(bio_check_ro(bio, p))) goto out; /* * Zone reset does not include bi_size so bio_sectors() is always 0. * Include a test for the reset op code and perform the remap if needed. */ if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) { if (bio_check_eod(bio, part_nr_sects_read(p))) goto out; bio->bi_iter.bi_sector += p->start_sect; trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), bio->bi_iter.bi_sector - p->start_sect); } bio->bi_partno = 0; ret = 0; out: rcu_read_unlock(); return ret; } static noinline_for_stack bool generic_make_request_checks(struct bio *bio) { struct request_queue *q; int nr_sectors = bio_sectors(bio); blk_status_t status = BLK_STS_IOERR; char b[BDEVNAME_SIZE]; might_sleep(); q = bio->bi_disk->queue; if (unlikely(!q)) { printk(KERN_ERR "generic_make_request: Trying to access " "nonexistent block-device %s (%Lu)\n", bio_devname(bio, b), (long long)bio->bi_iter.bi_sector); goto end_io; } /* * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue is not a request based queue. */ if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) goto not_supported; if (should_fail_bio(bio)) goto end_io; if (bio->bi_partno) { if (unlikely(blk_partition_remap(bio))) goto end_io; } else { if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0))) goto end_io; if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk)))) goto end_io; } /* * Filter flush bio's early so that make_request based * drivers without flush support don't have to worry * about them. */ if (op_is_flush(bio->bi_opf) && !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); if (!nr_sectors) { status = BLK_STS_OK; goto end_io; } } switch (bio_op(bio)) { case REQ_OP_DISCARD: if (!blk_queue_discard(q)) goto not_supported; break; case REQ_OP_SECURE_ERASE: if (!blk_queue_secure_erase(q)) goto not_supported; break; case REQ_OP_WRITE_SAME: if (!q->limits.max_write_same_sectors) goto not_supported; break; case REQ_OP_ZONE_REPORT: case REQ_OP_ZONE_RESET: if (!blk_queue_is_zoned(q)) goto not_supported; break; case REQ_OP_WRITE_ZEROES: if (!q->limits.max_write_zeroes_sectors) goto not_supported; break; default: break; } /* * Various block parts want %current->io_context and lazy ioc * allocation ends up trading a lot of pain for a small amount of * memory. Just allocate it upfront. This may fail and block * layer knows how to live with it. */ create_io_context(GFP_ATOMIC, q->node); if (!blkcg_bio_issue_check(q, bio)) return false; if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_queue(q, bio); /* Now that enqueuing has been traced, we need to trace * completion as well. */ bio_set_flag(bio, BIO_TRACE_COMPLETION); } return true; not_supported: status = BLK_STS_NOTSUPP; end_io: bio->bi_status = status; bio_endio(bio); return false; } /** * generic_make_request - hand a buffer to its device driver for I/O * @bio: The bio describing the location in memory and on the device. * * generic_make_request() is used to make I/O requests of block * devices. It is passed a &struct bio, which describes the I/O that needs * to be done. * * generic_make_request() does not return any status. The * success/failure status of the request, along with notification of * completion, is delivered asynchronously through the bio->bi_end_io * function described (one day) else where. * * The caller of generic_make_request must make sure that bi_io_vec * are set to describe the memory buffer, and that bi_dev and bi_sector are * set to describe the device address, and the * bi_end_io and optionally bi_private are set to describe how * completion notification should be signaled. * * generic_make_request and the drivers it calls may use bi_next if this * bio happens to be merged with someone else, and may resubmit the bio to * a lower device by calling into generic_make_request recursively, which * means the bio should NOT be touched after the call to ->make_request_fn. */ blk_qc_t generic_make_request(struct bio *bio) { /* * bio_list_on_stack[0] contains bios submitted by the current * make_request_fn. * bio_list_on_stack[1] contains bios that were submitted before * the current make_request_fn, but that haven't been processed * yet. */ struct bio_list bio_list_on_stack[2]; blk_mq_req_flags_t flags = 0; struct request_queue *q = bio->bi_disk->queue; blk_qc_t ret = BLK_QC_T_NONE; if (bio->bi_opf & REQ_NOWAIT) flags = BLK_MQ_REQ_NOWAIT; if (bio_flagged(bio, BIO_QUEUE_ENTERED)) blk_queue_enter_live(q); else if (blk_queue_enter(q, flags) < 0) { if (!blk_queue_dying(q) && (bio->bi_opf & REQ_NOWAIT)) bio_wouldblock_error(bio); else bio_io_error(bio); return ret; } if (!generic_make_request_checks(bio)) goto out; /* * We only want one ->make_request_fn to be active at a time, else * stack usage with stacked devices could be a problem. So use * current->bio_list to keep a list of requests submited by a * make_request_fn function. current->bio_list is also used as a * flag to say if generic_make_request is currently active in this * task or not. If it is NULL, then no make_request is active. If * it is non-NULL, then a make_request is active, and new requests * should be added at the tail */ if (current->bio_list) { bio_list_add(¤t->bio_list[0], bio); goto out; } /* following loop may be a bit non-obvious, and so deserves some * explanation. * Before entering the loop, bio->bi_next is NULL (as all callers * ensure that) so we have a list with a single bio. * We pretend that we have just taken it off a longer list, so * we assign bio_list to a pointer to the bio_list_on_stack, * thus initialising the bio_list of new bios to be * added. ->make_request() may indeed add some more bios * through a recursive call to generic_make_request. If it * did, we find a non-NULL value in bio_list and re-enter the loop * from the top. In this case we really did just take the bio * of the top of the list (no pretending) and so remove it from * bio_list, and call into ->make_request() again. */ BUG_ON(bio->bi_next); bio_list_init(&bio_list_on_stack[0]); current->bio_list = bio_list_on_stack; do { bool enter_succeeded = true; if (unlikely(q != bio->bi_disk->queue)) { if (q) blk_queue_exit(q); q = bio->bi_disk->queue; flags = 0; if (bio->bi_opf & REQ_NOWAIT) flags = BLK_MQ_REQ_NOWAIT; if (blk_queue_enter(q, flags) < 0) enter_succeeded = false; } if (enter_succeeded) { struct bio_list lower, same; /* Create a fresh bio_list for all subordinate requests */ bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); ret = q->make_request_fn(q, bio); /* sort new bios into those for a lower level * and those for the same level */ bio_list_init(&lower); bio_list_init(&same); while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) if (q == bio->bi_disk->queue) bio_list_add(&same, bio); else bio_list_add(&lower, bio); /* now assemble so we handle the lowest level first */ bio_list_merge(&bio_list_on_stack[0], &lower); bio_list_merge(&bio_list_on_stack[0], &same); bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); } else { if (unlikely(!blk_queue_dying(q) && (bio->bi_opf & REQ_NOWAIT))) bio_wouldblock_error(bio); else bio_io_error(bio); q = NULL; } bio = bio_list_pop(&bio_list_on_stack[0]); } while (bio); current->bio_list = NULL; /* deactivate */ out: if (q) blk_queue_exit(q); return ret; } EXPORT_SYMBOL(generic_make_request); /** * direct_make_request - hand a buffer directly to its device driver for I/O * @bio: The bio describing the location in memory and on the device. * * This function behaves like generic_make_request(), but does not protect * against recursion. Must only be used if the called driver is known * to not call generic_make_request (or direct_make_request) again from * its make_request function. (Calling direct_make_request again from * a workqueue is perfectly fine as that doesn't recurse). */ blk_qc_t direct_make_request(struct bio *bio) { struct request_queue *q = bio->bi_disk->queue; bool nowait = bio->bi_opf & REQ_NOWAIT; blk_qc_t ret; if (!generic_make_request_checks(bio)) return BLK_QC_T_NONE; if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) { if (nowait && !blk_queue_dying(q)) bio->bi_status = BLK_STS_AGAIN; else bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return BLK_QC_T_NONE; } ret = q->make_request_fn(q, bio); blk_queue_exit(q); return ret; } EXPORT_SYMBOL_GPL(direct_make_request); /** * submit_bio - submit a bio to the block device layer for I/O * @bio: The &struct bio which describes the I/O * * submit_bio() is very similar in purpose to generic_make_request(), and * uses that function to do most of the work. Both are fairly rough * interfaces; @bio must be presetup and ready for I/O. * */ blk_qc_t submit_bio(struct bio *bio) { /* * If it's a regular read/write or a barrier with data attached, * go through the normal accounting stuff before submission. */ if (bio_has_data(bio)) { unsigned int count; if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) count = queue_logical_block_size(bio->bi_disk->queue) >> 9; else count = bio_sectors(bio); if (op_is_write(bio_op(bio))) { count_vm_events(PGPGOUT, count); } else { task_io_account_read(bio->bi_iter.bi_size); count_vm_events(PGPGIN, count); } if (unlikely(block_dump)) { char b[BDEVNAME_SIZE]; printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", current->comm, task_pid_nr(current), op_is_write(bio_op(bio)) ? "WRITE" : "READ", (unsigned long long)bio->bi_iter.bi_sector, bio_devname(bio, b), count); } } return generic_make_request(bio); } EXPORT_SYMBOL(submit_bio); bool blk_poll(struct request_queue *q, blk_qc_t cookie) { if (!q->poll_fn || !blk_qc_t_valid(cookie)) return false; if (current->plug) blk_flush_plug_list(current->plug, false); return q->poll_fn(q, cookie); } EXPORT_SYMBOL_GPL(blk_poll); /** * blk_cloned_rq_check_limits - Helper function to check a cloned request * for new the queue limits * @q: the queue * @rq: the request being checked * * Description: * @rq may have been made based on weaker limitations of upper-level queues * in request stacking drivers, and it may violate the limitation of @q. * Since the block layer and the underlying device driver trust @rq * after it is inserted to @q, it should be checked against @q before * the insertion using this generic function. * * Request stacking drivers like request-based dm may change the queue * limits when retrying requests on other queues. Those requests need * to be checked against the new queue limits again during dispatch. */ static int blk_cloned_rq_check_limits(struct request_queue *q, struct request *rq) { if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, req_op(rq))) { printk(KERN_ERR "%s: over max size limit.\n", __func__); return -EIO; } /* * queue's settings related to segment counting like q->bounce_pfn * may differ from that of other stacking queues. * Recalculate it to check the request correctly on this queue's * limitation. */ blk_recalc_rq_segments(rq); if (rq->nr_phys_segments > queue_max_segments(q)) { printk(KERN_ERR "%s: over max segments limit.\n", __func__); return -EIO; } return 0; } /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request * @q: the queue to submit the request * @rq: the request being queued */ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) { unsigned long flags; int where = ELEVATOR_INSERT_BACK; if (blk_cloned_rq_check_limits(q, rq)) return BLK_STS_IOERR; if (rq->rq_disk && should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; if (q->mq_ops) { if (blk_queue_io_stat(q)) blk_account_io_start(rq, true); /* * Since we have a scheduler attached on the top device, * bypass a potential scheduler on the bottom device for * insert. */ return blk_mq_request_issue_directly(rq); } spin_lock_irqsave(q->queue_lock, flags); if (unlikely(blk_queue_dying(q))) { spin_unlock_irqrestore(q->queue_lock, flags); return BLK_STS_IOERR; } /* * Submitting request must be dequeued before calling this function * because it will be linked to another request_queue */ BUG_ON(blk_queued_rq(rq)); if (op_is_flush(rq->cmd_flags)) where = ELEVATOR_INSERT_FLUSH; add_acct_request(q, rq, where); if (where == ELEVATOR_INSERT_FLUSH) __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); return BLK_STS_OK; } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); /** * blk_rq_err_bytes - determine number of bytes till the next failure boundary * @rq: request to examine * * Description: * A request could be merge of IOs which require different failure * handling. This function determines the number of bytes which * can be failed from the beginning of the request without * crossing into area which need to be retried further. * * Return: * The number of bytes to fail. */ unsigned int blk_rq_err_bytes(const struct request *rq) { unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; unsigned int bytes = 0; struct bio *bio; if (!(rq->rq_flags & RQF_MIXED_MERGE)) return blk_rq_bytes(rq); /* * Currently the only 'mixing' which can happen is between * different fastfail types. We can safely fail portions * which have all the failfast bits that the first one has - * the ones which are at least as eager to fail as the first * one. */ for (bio = rq->bio; bio; bio = bio->bi_next) { if ((bio->bi_opf & ff) != ff) break; bytes += bio->bi_iter.bi_size; } /* this could lead to infinite loop */ BUG_ON(blk_rq_bytes(rq) && !bytes); return bytes; } EXPORT_SYMBOL_GPL(blk_rq_err_bytes); void blk_account_io_completion(struct request *req, unsigned int bytes) { if (blk_do_io_stat(req)) { const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; int cpu; cpu = part_stat_lock(); part = req->part; part_stat_add(cpu, part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } void blk_account_io_done(struct request *req, u64 now) { /* * Account IO completion. flush_rq isn't accounted as a * normal IO on queueing nor completion. Accounting the * containing request is enough. */ if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; int cpu; cpu = part_stat_lock(); part = req->part; part_stat_inc(cpu, part, ios[sgrp]); part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns); part_round_stats(req->q, cpu, part); part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); part_stat_unlock(); } } #ifdef CONFIG_PM /* * Don't process normal requests when queue is suspended * or in the process of suspending/resuming */ static bool blk_pm_allow_request(struct request *rq) { switch (rq->q->rpm_status) { case RPM_RESUMING: case RPM_SUSPENDING: return rq->rq_flags & RQF_PM; case RPM_SUSPENDED: return false; default: return true; } } #else static bool blk_pm_allow_request(struct request *rq) { return true; } #endif void blk_account_io_start(struct request *rq, bool new_io) { struct hd_struct *part; int rw = rq_data_dir(rq); int cpu; if (!blk_do_io_stat(rq)) return; cpu = part_stat_lock(); if (!new_io) { part = rq->part; part_stat_inc(cpu, part, merges[rw]); } else { part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); if (!hd_struct_try_get(part)) { /* * The partition is already being removed, * the request will be accounted on the disk only * * We take a reference on disk->part0 although that * partition will never be deleted, so we can treat * it as any other partition. */ part = &rq->rq_disk->part0; hd_struct_get(part); } part_round_stats(rq->q, cpu, part); part_inc_in_flight(rq->q, part, rw); rq->part = part; } part_stat_unlock(); } static struct request *elv_next_request(struct request_queue *q) { struct request *rq; struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); WARN_ON_ONCE(q->mq_ops); while (1) { list_for_each_entry(rq, &q->queue_head, queuelist) { if (blk_pm_allow_request(rq)) return rq; if (rq->rq_flags & RQF_SOFTBARRIER) break; } /* * Flush request is running and flush request isn't queueable * in the drive, we can hold the queue till flush request is * finished. Even we don't do this, driver can't dispatch next * requests and will requeue them. And this can improve * throughput too. For example, we have request flush1, write1, * flush 2. flush1 is dispatched, then queue is hold, write1 * isn't inserted to queue. After flush1 is finished, flush2 * will be dispatched. Since disk cache is already clean, * flush2 will be finished very soon, so looks like flush2 is * folded to flush1. * Since the queue is hold, a flag is set to indicate the queue * should be restarted later. Please see flush_end_io() for * details. */ if (fq->flush_pending_idx != fq->flush_running_idx && !queue_flush_queueable(q)) { fq->flush_queue_delayed = 1; return NULL; } if (unlikely(blk_queue_bypass(q)) || !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0)) return NULL; } } /** * blk_peek_request - peek at the top of a request queue * @q: request queue to peek at * * Description: * Return the request at the top of @q. The returned request * should be started using blk_start_request() before LLD starts * processing it. * * Return: * Pointer to the request at the top of @q if available. Null * otherwise. */ struct request *blk_peek_request(struct request_queue *q) { struct request *rq; int ret; lockdep_assert_held(q->queue_lock); WARN_ON_ONCE(q->mq_ops); while ((rq = elv_next_request(q)) != NULL) { if (!(rq->rq_flags & RQF_STARTED)) { /* * This is the first time the device driver * sees this request (possibly after * requeueing). Notify IO scheduler. */ if (rq->rq_flags & RQF_SORTED) elv_activate_rq(q, rq); /* * just mark as started even if we don't start * it, a request that has been delayed should * not be passed by new incoming requests */ rq->rq_flags |= RQF_STARTED; trace_block_rq_issue(q, rq); } if (!q->boundary_rq || q->boundary_rq == rq) { q->end_sector = rq_end_sector(rq); q->boundary_rq = NULL; } if (rq->rq_flags & RQF_DONTPREP) break; if (q->dma_drain_size && blk_rq_bytes(rq)) { /* * make sure space for the drain appears we * know we can do this because max_hw_segments * has been adjusted to be one fewer than the * device can handle */ rq->nr_phys_segments++; } if (!q->prep_rq_fn) break; ret = q->prep_rq_fn(q, rq); if (ret == BLKPREP_OK) { break; } else if (ret == BLKPREP_DEFER) { /* * the request may have been (partially) prepped. * we need to keep this request in the front to * avoid resource deadlock. RQF_STARTED will * prevent other fs requests from passing this one. */ if (q->dma_drain_size && blk_rq_bytes(rq) && !(rq->rq_flags & RQF_DONTPREP)) { /* * remove the space for the drain we added * so that we don't add it again */ --rq->nr_phys_segments; } rq = NULL; break; } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) { rq->rq_flags |= RQF_QUIET; /* * Mark this request as started so we don't trigger * any debug logic in the end I/O path. */ blk_start_request(rq); __blk_end_request_all(rq, ret == BLKPREP_INVALID ? BLK_STS_TARGET : BLK_STS_IOERR); } else { printk(KERN_ERR "%s: bad return=%d\n", __func__, ret); break; } } return rq; } EXPORT_SYMBOL(blk_peek_request); static void blk_dequeue_request(struct request *rq) { struct request_queue *q = rq->q; BUG_ON(list_empty(&rq->queuelist)); BUG_ON(ELV_ON_HASH(rq)); list_del_init(&rq->queuelist); /* * the time frame between a request being removed from the lists * and to it is freed is accounted as io that is in progress at * the driver side. */ if (blk_account_rq(rq)) q->in_flight[rq_is_sync(rq)]++; } /** * blk_start_request - start request processing on the driver * @req: request to dequeue * * Description: * Dequeue @req and start timeout timer on it. This hands off the * request to the driver. */ void blk_start_request(struct request *req) { lockdep_assert_held(req->q->queue_lock); WARN_ON_ONCE(req->q->mq_ops); blk_dequeue_request(req); if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { req->io_start_time_ns = ktime_get_ns(); #ifdef CONFIG_BLK_DEV_THROTTLING_LOW req->throtl_size = blk_rq_sectors(req); #endif req->rq_flags |= RQF_STATS; rq_qos_issue(req->q, req); } BUG_ON(blk_rq_is_complete(req)); blk_add_timer(req); } EXPORT_SYMBOL(blk_start_request); /** * blk_fetch_request - fetch a request from a request queue * @q: request queue to fetch a request from * * Description: * Return the request at the top of @q. The request is started on * return and LLD can start processing it immediately. * * Return: * Pointer to the request at the top of @q if available. Null * otherwise. */ struct request *blk_fetch_request(struct request_queue *q) { struct request *rq; lockdep_assert_held(q->queue_lock); WARN_ON_ONCE(q->mq_ops); rq = blk_peek_request(q); if (rq) blk_start_request(rq); return rq; } EXPORT_SYMBOL(blk_fetch_request); /* * Steal bios from a request and add them to a bio list. * The request must not have been partially completed before. */ void blk_steal_bios(struct bio_list *list, struct request *rq) { if (rq->bio) { if (list->tail) list->tail->bi_next = rq->bio; else list->head = rq->bio; list->tail = rq->biotail; rq->bio = NULL; rq->biotail = NULL; } rq->__data_len = 0; } EXPORT_SYMBOL_GPL(blk_steal_bios); /** * blk_update_request - Special helper function for request stacking drivers * @req: the request being processed * @error: block status code * @nr_bytes: number of bytes to complete @req * * Description: * Ends I/O on a number of bytes attached to @req, but doesn't complete * the request structure even if @req doesn't have leftover. * If @req has leftover, sets it up for the next range of segments. * * This special helper function is only for request stacking drivers * (e.g. request-based dm) so that they can handle partial completion. * Actual device drivers should use blk_end_request instead. * * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. * * Note: * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both * blk_rq_bytes() and in blk_update_request(). * * Return: * %false - this request doesn't have any more data * %true - this request has more data **/ bool blk_update_request(struct request *req, blk_status_t error, unsigned int nr_bytes) { int total_bytes; trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes); if (!req->bio) return false; if (unlikely(error && !blk_rq_is_passthrough(req) && !(req->rq_flags & RQF_QUIET))) print_req_error(req, error); blk_account_io_completion(req, nr_bytes); total_bytes = 0; while (req->bio) { struct bio *bio = req->bio; unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); if (bio_bytes == bio->bi_iter.bi_size) req->bio = bio->bi_next; /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); req_bio_endio(req, bio, bio_bytes, error); total_bytes += bio_bytes; nr_bytes -= bio_bytes; if (!nr_bytes) break; } /* * completely done */ if (!req->bio) { /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later. */ req->__data_len = 0; return false; } req->__data_len -= total_bytes; /* update sector only for requests with clear definition of sector */ if (!blk_rq_is_passthrough(req)) req->__sector += total_bytes >> 9; /* mixed attributes always follow the first bio */ if (req->rq_flags & RQF_MIXED_MERGE) { req->cmd_flags &= ~REQ_FAILFAST_MASK; req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; } if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { /* * If total number of sectors is less than the first segment * size, something has gone terribly wrong. */ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { blk_dump_rq_flags(req, "request botched"); req->__data_len = blk_rq_cur_bytes(req); } /* recalculate the number of segments */ blk_recalc_rq_segments(req); } return true; } EXPORT_SYMBOL_GPL(blk_update_request); static bool blk_update_bidi_request(struct request *rq, blk_status_t error, unsigned int nr_bytes, unsigned int bidi_bytes) { if (blk_update_request(rq, error, nr_bytes)) return true; /* Bidi request must be completed as a whole */ if (unlikely(blk_bidi_rq(rq)) && blk_update_request(rq->next_rq, error, bidi_bytes)) return true; if (blk_queue_add_random(rq->q)) add_disk_randomness(rq->rq_disk); return false; } /** * blk_unprep_request - unprepare a request * @req: the request * * This function makes a request ready for complete resubmission (or * completion). It happens only after all error handling is complete, * so represents the appropriate moment to deallocate any resources * that were allocated to the request in the prep_rq_fn. The queue * lock is held when calling this. */ void blk_unprep_request(struct request *req) { struct request_queue *q = req->q; req->rq_flags &= ~RQF_DONTPREP; if (q->unprep_rq_fn) q->unprep_rq_fn(q, req); } EXPORT_SYMBOL_GPL(blk_unprep_request); void blk_finish_request(struct request *req, blk_status_t error) { struct request_queue *q = req->q; u64 now = ktime_get_ns(); lockdep_assert_held(req->q->queue_lock); WARN_ON_ONCE(q->mq_ops); if (req->rq_flags & RQF_STATS) blk_stat_add(req, now); if (req->rq_flags & RQF_QUEUED) blk_queue_end_tag(q, req); BUG_ON(blk_queued_rq(req)); if (unlikely(laptop_mode) && !blk_rq_is_passthrough(req)) laptop_io_completion(req->q->backing_dev_info); blk_delete_timer(req); if (req->rq_flags & RQF_DONTPREP) blk_unprep_request(req); blk_account_io_done(req, now); if (req->end_io) { rq_qos_done(q, req); req->end_io(req, error); } else { if (blk_bidi_rq(req)) __blk_put_request(req->next_rq->q, req->next_rq); __blk_put_request(q, req); } } EXPORT_SYMBOL(blk_finish_request); /** * blk_end_bidi_request - Complete a bidi request * @rq: the request to complete * @error: block status code * @nr_bytes: number of bytes to complete @rq * @bidi_bytes: number of bytes to complete @rq->next_rq * * Description: * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. * Drivers that supports bidi can safely call this member for any * type of request, bidi or uni. In the later case @bidi_bytes is * just ignored. * * Return: * %false - we are done with this request * %true - still buffers pending for this request **/ static bool blk_end_bidi_request(struct request *rq, blk_status_t error, unsigned int nr_bytes, unsigned int bidi_bytes) { struct request_queue *q = rq->q; unsigned long flags; WARN_ON_ONCE(q->mq_ops); if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) return true; spin_lock_irqsave(q->queue_lock, flags); blk_finish_request(rq, error); spin_unlock_irqrestore(q->queue_lock, flags); return false; } /** * __blk_end_bidi_request - Complete a bidi request with queue lock held * @rq: the request to complete * @error: block status code * @nr_bytes: number of bytes to complete @rq * @bidi_bytes: number of bytes to complete @rq->next_rq * * Description: * Identical to blk_end_bidi_request() except that queue lock is * assumed to be locked on entry and remains so on return. * * Return: * %false - we are done with this request * %true - still buffers pending for this request **/ static bool __blk_end_bidi_request(struct request *rq, blk_status_t error, unsigned int nr_bytes, unsigned int bidi_bytes) { lockdep_assert_held(rq->q->queue_lock); WARN_ON_ONCE(rq->q->mq_ops); if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) return true; blk_finish_request(rq, error); return false; } /** * blk_end_request - Helper function for drivers to complete the request. * @rq: the request being processed * @error: block status code * @nr_bytes: number of bytes to complete * * Description: * Ends I/O on a number of bytes attached to @rq. * If @rq has leftover, sets it up for the next range of segments. * * Return: * %false - we are done with this request * %true - still buffers pending for this request **/ bool blk_end_request(struct request *rq, blk_status_t error, unsigned int nr_bytes) { WARN_ON_ONCE(rq->q->mq_ops); return blk_end_bidi_request(rq, error, nr_bytes, 0); } EXPORT_SYMBOL(blk_end_request); /** * blk_end_request_all - Helper function for drives to finish the request. * @rq: the request to finish * @error: block status code * * Description: * Completely finish @rq. */ void blk_end_request_all(struct request *rq, blk_status_t error) { bool pending; unsigned int bidi_bytes = 0; if (unlikely(blk_bidi_rq(rq))) bidi_bytes = blk_rq_bytes(rq->next_rq); pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); BUG_ON(pending); } EXPORT_SYMBOL(blk_end_request_all); /** * __blk_end_request - Helper function for drivers to complete the request. * @rq: the request being processed * @error: block status code * @nr_bytes: number of bytes to complete * * Description: * Must be called with queue lock held unlike blk_end_request(). * * Return: * %false - we are done with this request * %true - still buffers pending for this request **/ bool __blk_end_request(struct request *rq, blk_status_t error, unsigned int nr_bytes) { lockdep_assert_held(rq->q->queue_lock); WARN_ON_ONCE(rq->q->mq_ops); return __blk_end_bidi_request(rq, error, nr_bytes, 0); } EXPORT_SYMBOL(__blk_end_request); /** * __blk_end_request_all - Helper function for drives to finish the request. * @rq: the request to finish * @error: block status code * * Description: * Completely finish @rq. Must be called with queue lock held. */ void __blk_end_request_all(struct request *rq, blk_status_t error) { bool pending; unsigned int bidi_bytes = 0; lockdep_assert_held(rq->q->queue_lock); WARN_ON_ONCE(rq->q->mq_ops); if (unlikely(blk_bidi_rq(rq))) bidi_bytes = blk_rq_bytes(rq->next_rq); pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); BUG_ON(pending); } EXPORT_SYMBOL(__blk_end_request_all); /** * __blk_end_request_cur - Helper function to finish the current request chunk. * @rq: the request to finish the current chunk for * @error: block status code * * Description: * Complete the current consecutively mapped chunk from @rq. Must * be called with queue lock held. * * Return: * %false - we are done with this request * %true - still buffers pending for this request */ bool __blk_end_request_cur(struct request *rq, blk_status_t error) { return __blk_end_request(rq, error, blk_rq_cur_bytes(rq)); } EXPORT_SYMBOL(__blk_end_request_cur); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) { if (bio_has_data(bio)) rq->nr_phys_segments = bio_phys_segments(q, bio); else if (bio_op(bio) == REQ_OP_DISCARD) rq->nr_phys_segments = 1; rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; if (bio->bi_disk) rq->rq_disk = bio->bi_disk; } #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE /** * rq_flush_dcache_pages - Helper function to flush all pages in a request * @rq: the request to be flushed * * Description: * Flush all pages in @rq. */ void rq_flush_dcache_pages(struct request *rq) { struct req_iterator iter; struct bio_vec bvec; rq_for_each_segment(bvec, rq, iter) flush_dcache_page(bvec.bv_page); } EXPORT_SYMBOL_GPL(rq_flush_dcache_pages); #endif /** * blk_lld_busy - Check if underlying low-level drivers of a device are busy * @q : the queue of the device being checked * * Description: * Check if underlying low-level drivers of a device are busy. * If the drivers want to export their busy state, they must set own * exporting function using blk_queue_lld_busy() first. * * Basically, this function is used only by request stacking drivers * to stop dispatching requests to underlying devices when underlying * devices are busy. This behavior helps more I/O merging on the queue * of the request stacking driver and prevents I/O throughput regression * on burst I/O load. * * Return: * 0 - Not busy (The request stacking driver should dispatch request) * 1 - Busy (The request stacking driver should stop dispatching request) */ int blk_lld_busy(struct request_queue *q) { if (q->lld_busy_fn) return q->lld_busy_fn(q); return 0; } EXPORT_SYMBOL_GPL(blk_lld_busy); /** * blk_rq_unprep_clone - Helper function to free all bios in a cloned request * @rq: the clone request to be cleaned up * * Description: * Free all bios in @rq for a cloned request. */ void blk_rq_unprep_clone(struct request *rq) { struct bio *bio; while ((bio = rq->bio) != NULL) { rq->bio = bio->bi_next; bio_put(bio); } } EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); /* * Copy attributes of the original request to the clone request. * The actual data parts (e.g. ->cmd, ->sense) are not copied. */ static void __blk_rq_prep_clone(struct request *dst, struct request *src) { dst->cpu = src->cpu; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); if (src->rq_flags & RQF_SPECIAL_PAYLOAD) { dst->rq_flags |= RQF_SPECIAL_PAYLOAD; dst->special_vec = src->special_vec; } dst->nr_phys_segments = src->nr_phys_segments; dst->ioprio = src->ioprio; dst->extra_len = src->extra_len; } /** * blk_rq_prep_clone - Helper function to setup clone request * @rq: the request to be setup * @rq_src: original request to be cloned * @bs: bio_set that bios for clone are allocated from * @gfp_mask: memory allocation mask for bio * @bio_ctr: setup function to be called for each clone bio. * Returns %0 for success, non %0 for failure. * @data: private data to be passed to @bio_ctr * * Description: * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. * The actual data parts of @rq_src (e.g. ->cmd, ->sense) * are not copied, and copying such parts is the caller's responsibility. * Also, pages which the original bios are pointing to are not copied * and the cloned bios just point same pages. * So cloned bios must be completed before original bios, which means * the caller must complete @rq before @rq_src. */ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data) { struct bio *bio, *bio_src; if (!bs) bs = &fs_bio_set; __rq_for_each_bio(bio_src, rq_src) { bio = bio_clone_fast(bio_src, gfp_mask, bs); if (!bio) goto free_and_out; if (bio_ctr && bio_ctr(bio, bio_src, data)) goto free_and_out; if (rq->bio) { rq->biotail->bi_next = bio; rq->biotail = bio; } else rq->bio = rq->biotail = bio; } __blk_rq_prep_clone(rq, rq_src); return 0; free_and_out: if (bio) bio_put(bio); blk_rq_unprep_clone(rq); return -ENOMEM; } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); int kblockd_schedule_work(struct work_struct *work) { return queue_work(kblockd_workqueue, work); } EXPORT_SYMBOL(kblockd_schedule_work); int kblockd_schedule_work_on(int cpu, struct work_struct *work) { return queue_work_on(cpu, kblockd_workqueue, work); } EXPORT_SYMBOL(kblockd_schedule_work_on); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); } EXPORT_SYMBOL(kblockd_mod_delayed_work_on); /** * blk_start_plug - initialize blk_plug and track it inside the task_struct * @plug: The &struct blk_plug that needs to be initialized * * Description: * Tracking blk_plug inside the task_struct will help with auto-flushing the * pending I/O should the task end up blocking between blk_start_plug() and * blk_finish_plug(). This is important from a performance perspective, but * also ensures that we don't deadlock. For instance, if the task is blocking * for a memory allocation, memory reclaim could end up wanting to free a * page belonging to that request that is currently residing in our private * plug. By flushing the pending I/O when the process goes to sleep, we avoid * this kind of deadlock. */ void blk_start_plug(struct blk_plug *plug) { struct task_struct *tsk = current; /* * If this is a nested plug, don't actually assign it. */ if (tsk->plug) return; INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); /* * Store ordering should not be needed here, since a potential * preempt will imply a full memory barrier */ tsk->plug = plug; } EXPORT_SYMBOL(blk_start_plug); static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) { struct request *rqa = container_of(a, struct request, queuelist); struct request *rqb = container_of(b, struct request, queuelist); return !(rqa->q < rqb->q || (rqa->q == rqb->q && blk_rq_pos(rqa) < blk_rq_pos(rqb))); } /* * If 'from_schedule' is true, then postpone the dispatch of requests * until a safe kblockd context. We due this to avoid accidental big * additional stack usage in driver dispatch, in places where the originally * plugger did not intend it. */ static void queue_unplugged(struct request_queue *q, unsigned int depth, bool from_schedule) __releases(q->queue_lock) { lockdep_assert_held(q->queue_lock); trace_block_unplug(q, depth, !from_schedule); if (from_schedule) blk_run_queue_async(q); else __blk_run_queue(q); spin_unlock_irq(q->queue_lock); } static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) { LIST_HEAD(callbacks); while (!list_empty(&plug->cb_list)) { list_splice_init(&plug->cb_list, &callbacks); while (!list_empty(&callbacks)) { struct blk_plug_cb *cb = list_first_entry(&callbacks, struct blk_plug_cb, list); list_del(&cb->list); cb->callback(cb, from_schedule); } } } struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, int size) { struct blk_plug *plug = current->plug; struct blk_plug_cb *cb; if (!plug) return NULL; list_for_each_entry(cb, &plug->cb_list, list) if (cb->callback == unplug && cb->data == data) return cb; /* Not currently on the callback list */ BUG_ON(size < sizeof(*cb)); cb = kzalloc(size, GFP_ATOMIC); if (cb) { cb->data = data; cb->callback = unplug; list_add(&cb->list, &plug->cb_list); } return cb; } EXPORT_SYMBOL(blk_check_plugged); void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request_queue *q; struct request *rq; LIST_HEAD(list); unsigned int depth; flush_plug_callbacks(plug, from_schedule); if (!list_empty(&plug->mq_list)) blk_mq_flush_plug_list(plug, from_schedule); if (list_empty(&plug->list)) return; list_splice_init(&plug->list, &list); list_sort(NULL, &list, plug_rq_cmp); q = NULL; depth = 0; while (!list_empty(&list)) { rq = list_entry_rq(list.next); list_del_init(&rq->queuelist); BUG_ON(!rq->q); if (rq->q != q) { /* * This drops the queue lock */ if (q) queue_unplugged(q, depth, from_schedule); q = rq->q; depth = 0; spin_lock_irq(q->queue_lock); } /* * Short-circuit if @q is dead */ if (unlikely(blk_queue_dying(q))) { __blk_end_request_all(rq, BLK_STS_IOERR); continue; } /* * rq is already accounted, so use raw insert */ if (op_is_flush(rq->cmd_flags)) __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH); else __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); depth++; } /* * This drops the queue lock */ if (q) queue_unplugged(q, depth, from_schedule); } void blk_finish_plug(struct blk_plug *plug) { if (plug != current->plug) return; blk_flush_plug_list(plug, false); current->plug = NULL; } EXPORT_SYMBOL(blk_finish_plug); #ifdef CONFIG_PM /** * blk_pm_runtime_init - Block layer runtime PM initialization routine * @q: the queue of the device * @dev: the device the queue belongs to * * Description: * Initialize runtime-PM-related fields for @q and start auto suspend for * @dev. Drivers that want to take advantage of request-based runtime PM * should call this function after @dev has been initialized, and its * request queue @q has been allocated, and runtime PM for it can not happen * yet(either due to disabled/forbidden or its usage_count > 0). In most * cases, driver should call this function before any I/O has taken place. * * This function takes care of setting up using auto suspend for the device, * the autosuspend delay is set to -1 to make runtime suspend impossible * until an updated value is either set by user or by driver. Drivers do * not need to touch other autosuspend settings. * * The block layer runtime PM is request based, so only works for drivers * that use request as their IO unit instead of those directly use bio's. */ void blk_pm_runtime_init(struct request_queue *q, struct device *dev) { /* Don't enable runtime PM for blk-mq until it is ready */ if (q->mq_ops) { pm_runtime_disable(dev); return; } q->dev = dev; q->rpm_status = RPM_ACTIVE; pm_runtime_set_autosuspend_delay(q->dev, -1); pm_runtime_use_autosuspend(q->dev); } EXPORT_SYMBOL(blk_pm_runtime_init); /** * blk_pre_runtime_suspend - Pre runtime suspend check * @q: the queue of the device * * Description: * This function will check if runtime suspend is allowed for the device * by examining if there are any requests pending in the queue. If there * are requests pending, the device can not be runtime suspended; otherwise, * the queue's status will be updated to SUSPENDING and the driver can * proceed to suspend the device. * * For the not allowed case, we mark last busy for the device so that * runtime PM core will try to autosuspend it some time later. * * This function should be called near the start of the device's * runtime_suspend callback. * * Return: * 0 - OK to runtime suspend the device * -EBUSY - Device should not be runtime suspended */ int blk_pre_runtime_suspend(struct request_queue *q) { int ret = 0; if (!q->dev) return ret; spin_lock_irq(q->queue_lock); if (q->nr_pending) { ret = -EBUSY; pm_runtime_mark_last_busy(q->dev); } else { q->rpm_status = RPM_SUSPENDING; } spin_unlock_irq(q->queue_lock); return ret; } EXPORT_SYMBOL(blk_pre_runtime_suspend); /** * blk_post_runtime_suspend - Post runtime suspend processing * @q: the queue of the device * @err: return value of the device's runtime_suspend function * * Description: * Update the queue's runtime status according to the return value of the * device's runtime suspend function and mark last busy for the device so * that PM core will try to auto suspend the device at a later time. * * This function should be called near the end of the device's * runtime_suspend callback. */ void blk_post_runtime_suspend(struct request_queue *q, int err) { if (!q->dev) return; spin_lock_irq(q->queue_lock); if (!err) { q->rpm_status = RPM_SUSPENDED; } else { q->rpm_status = RPM_ACTIVE; pm_runtime_mark_last_busy(q->dev); } spin_unlock_irq(q->queue_lock); } EXPORT_SYMBOL(blk_post_runtime_suspend); /** * blk_pre_runtime_resume - Pre runtime resume processing * @q: the queue of the device * * Description: * Update the queue's runtime status to RESUMING in preparation for the * runtime resume of the device. * * This function should be called near the start of the device's * runtime_resume callback. */ void blk_pre_runtime_resume(struct request_queue *q) { if (!q->dev) return; spin_lock_irq(q->queue_lock); q->rpm_status = RPM_RESUMING; spin_unlock_irq(q->queue_lock); } EXPORT_SYMBOL(blk_pre_runtime_resume); /** * blk_post_runtime_resume - Post runtime resume processing * @q: the queue of the device * @err: return value of the device's runtime_resume function * * Description: * Update the queue's runtime status according to the return value of the * device's runtime_resume function. If it is successfully resumed, process * the requests that are queued into the device's queue when it is resuming * and then mark last busy and initiate autosuspend for it. * * This function should be called near the end of the device's * runtime_resume callback. */ void blk_post_runtime_resume(struct request_queue *q, int err) { if (!q->dev) return; spin_lock_irq(q->queue_lock); if (!err) { q->rpm_status = RPM_ACTIVE; __blk_run_queue(q); pm_runtime_mark_last_busy(q->dev); pm_request_autosuspend(q->dev); } else { q->rpm_status = RPM_SUSPENDED; } spin_unlock_irq(q->queue_lock); } EXPORT_SYMBOL(blk_post_runtime_resume); /** * blk_set_runtime_active - Force runtime status of the queue to be active * @q: the queue of the device * * If the device is left runtime suspended during system suspend the resume * hook typically resumes the device and corrects runtime status * accordingly. However, that does not affect the queue runtime PM status * which is still "suspended". This prevents processing requests from the * queue. * * This function can be used in driver's resume hook to correct queue * runtime PM status and re-enable peeking requests from the queue. It * should be called before first request is added to the queue. */ void blk_set_runtime_active(struct request_queue *q) { spin_lock_irq(q->queue_lock); q->rpm_status = RPM_ACTIVE; pm_runtime_mark_last_busy(q->dev); pm_request_autosuspend(q->dev); spin_unlock_irq(q->queue_lock); } EXPORT_SYMBOL(blk_set_runtime_active); #endif int __init blk_dev_init(void) { BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * FIELD_SIZEOF(struct request, cmd_flags)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * FIELD_SIZEOF(struct bio, bi_opf)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!kblockd_workqueue) panic("Failed to create kblockd\n"); request_cachep = kmem_cache_create("blkdev_requests", sizeof(struct request), 0, SLAB_PANIC, NULL); blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); #ifdef CONFIG_DEBUG_FS blk_debugfs_root = debugfs_create_dir("block", NULL); #endif return 0; } |
1681 3612 5415 1533 1533 1533 1533 542 541 2055 2084 1989 2082 1734 212 175 43 17 46 161 161 10 2354 1225 1594 2312 3322 1524 1700 142 2732 2736 521 2899 2900 107 15 9 18 18 51 1785 1785 1785 1783 1782 1783 1778 1128 1127 1111 1109 1109 1108 1017 1018 1018 1108 1125 1114 3 1128 1114 1127 1749 1750 1744 1741 703 684 33 33 7 27 4 4 19 19 19 19 19 19 10 10 6 10 6 10 10 6 10 2162 2217 400 295 400 589 589 578 580 223 571 461 136 552 206 206 202 752 753 249 751 246 194 237 740 76 76 677 675 677 15 162 542 113 84 84 36 47 47 47 31 47 2 2410 2454 2021 143 41 32 13 7 150 3 2311 270 268 98 136 265 5 1167 1361 1321 2355 667 152 577 2309 935 2308 2308 2352 2354 2353 2310 4 3 2 3 4 471 471 471 471 29 29 29 12 12 12 12 29 29 29 29 24 120 104 24 24 23 87 120 120 104 24 24 104 120 965 1084 1083 477 469 143 129 132 27 3 32 31 9 4 32 32 10651 28 205 205 205 205 202 205 204 18 203 8 10644 10641 156 10648 742 453 451 452 246 14 10642 451 10631 5 52 10619 742 9678 2349 10582 10592 10582 10570 17 1438 1420 1104 10635 10637 89 85 10540 1438 1439 10599 10634 33 33 737 736 718 718 706 384 383 93 384 123 383 1086 1086 502 499 19 629 13 13 615 616 1077 411 14 10801 10808 10792 396 4973 5153 5150 3 3 24 3 3 2 3 3 3 10801 10799 10793 81 78 3 10793 32 10795 10797 10799 10621 10798 10066 10067 10067 10051 10039 4 4 10784 10788 36 30 29 10 28 2 30 2218 2274 2217 2219 3 3 3 2225 2220 2221 2219 2220 5 5 5 2223 4 4 4 4 4 4 4 4 4 4 172 4 4 4 4 2226 2225 2222 1851 445 407 446 446 881 880 879 279 279 257 255 242 257 147 1370 1372 1372 1371 168 160 168 167 1374 92 1371 83 1372 3 1369 1372 91 91 90 1373 1370 35 2 34 1362 90 38 89 101 1362 1365 1359 1362 1330 65 65 65 1373 1371 1332 1373 1374 1370 571 569 15 555 2 15 571 1375 1371 1366 1352 542 542 542 542 19 19 18 18 17 18 17 18 18 19 18 4 9 27 27 27 19 19 18 19 2 3 62 62 61 61 536 536 536 536 185 185 199 199 199 199 199 398 275 180 393 393 497 497 2109 2107 724 437 1903 497 497 79 1611 1611 51 1611 391 363 395 117 181 391 18 18 391 18 395 181 181 181 381 394 117 219 147 219 311 311 258 311 397 324 397 391 207 369 391 207 207 207 391 390 207 207 362 391 369 391 391 8 8 2226 555 2226 2096 363 364 173 397 395 363 364 363 111 111 362 174 174 174 46 173 173 173 362 368 368 368 367 368 278 112 368 358 364 361 279 161 174 174 19 19 19 19 19 19 362 362 263 362 138 138 530 167 229 229 60 60 229 220 205 98 105 98 305 305 113 113 304 121 305 139 1067 1874 1023 22 943 19 1065 745 2349 1282 1241 2349 924 924 452 389 918 141 141 122 902 210 210 902 1320 994 1319 395 1228 608 229 1219 517 505 428 2 145 1 1 227 15 213 3 33 458 455 172 57 9 366 37 22 37 21 29 6 170 204 200 199 203 172 5 5 2216 2215 2216 2215 1 1 2216 529 530 17 542 541 73 30 30 30 29 25 15 1 29 18 18 17 16 30 1 1539 13 1537 181 617 617 617 616 617 617 617 616 541 542 541 468 434 79 434 418 418 497 497 432 310 310 51 51 47 5 51 118 118 45 3 1610 1168 1609 129 1610 48 1611 49 1610 48 1610 1 1611 405 1608 1611 1213 1610 1611 1611 861 1611 51 1610 684 1611 118 118 683 684 17 682 684 1611 139 139 108 276 253 29 2036 2037 2037 807 807 806 2037 118 1733 1734 1732 1712 1502 1 1712 191 191 1712 1644 1644 1495 1644 1222 1645 667 1644 113 1645 177 1645 1641 1531 556 1405 1164 584 1472 80 49 80 59 372 370 250 405 405 403 2 2 2 407 7137 405 405 404 403 404 404 288 386 383 383 2251 376 284 174 2250 42 69 69 31 31 791 2037 2037 2037 2035 2037 2037 2037 692 2036 2037 949 2036 807 806 14 803 402 402 1246 1199 127 597 597 308 290 308 308 21 5 5 4 5 273 231 272 231 273 684 684 684 684 684 684 246 30 216 30 84 28 140 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 | /* * NET3 Protocol independent device support routines. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * Derived from the non IP parts of dev.c 1.0.19 * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * * Additional Authors: * Florian la Roche <rzsfl@rz.uni-sb.de> * Alan Cox <gw4pts@gw4pts.ampr.org> * David Hinds <dahinds@users.sourceforge.net> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * Adam Sulmicki <adam@cfar.umd.edu> * Pekka Riikonen <priikone@poesidon.pspt.fi> * * Changes: * D.J. Barrow : Fixed bug where dev->refcnt gets set * to 2 if register_netdev gets called * before net_dev_init & also removed a * few lines of code in the process. * Alan Cox : device private ioctl copies fields back. * Alan Cox : Transmit queue code does relevant * stunts to keep the queue safe. * Alan Cox : Fixed double lock. * Alan Cox : Fixed promisc NULL pointer trap * ???????? : Support the full private ioctl range * Alan Cox : Moved ioctl permission check into * drivers * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI * Alan Cox : 100 backlog just doesn't cut it when * you start doing multicast video 8) * Alan Cox : Rewrote net_bh and list manager. * Alan Cox : Fix ETH_P_ALL echoback lengths. * Alan Cox : Took out transmit every packet pass * Saved a few bytes in the ioctl handler * Alan Cox : Network driver sets packet type before * calling netif_rx. Saves a function * call a packet. * Alan Cox : Hashed net_bh() * Richard Kooijman: Timestamp fixes. * Alan Cox : Wrong field in SIOCGIFDSTADDR * Alan Cox : Device lock protection. * Alan Cox : Fixed nasty side effect of device close * changes. * Rudi Cilibrasi : Pass the right thing to * set_mac_address() * Dave Miller : 32bit quantity for the device lock to * make it work out on a Sparc. * Bjorn Ekwall : Added KERNELD hack. * Alan Cox : Cleaned up the backlog initialise. * Craig Metz : SIOCGIFCONF fix if space for under * 1 device. * Thomas Bogendoerfer : Return ENODEV for dev_open, if there * is no device open function. * Andi Kleen : Fix error reporting for SIOCGIFCONF * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF * Cyrus Durgin : Cleaned for KMOD * Adam Sulmicki : Bug Fix : Network Device Unload * A network device unload needs to purge * the backlog queue. * Paul Rusty Russell : SIOCSIFNAME * Pekka Riikonen : Netdev boot-time settings code * Andrew Morton : Make unregister_netdevice wait * indefinitely on dev->refcnt * J Hadi Salim : - Backlog queue sampling * - netif_rx() feedback */ #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/hash.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/skbuff.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/busy_poll.h> #include <linux/rtnetlink.h> #include <linux/stat.h> #include <net/dst.h> #include <net/dst_metadata.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/checksum.h> #include <net/xfrm.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netpoll.h> #include <linux/rcupdate.h> #include <linux/delay.h> #include <net/iw_handler.h> #include <asm/current.h> #include <linux/audit.h> #include <linux/dmaengine.h> #include <linux/err.h> #include <linux/ctype.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include <linux/ip.h> #include <net/ip.h> #include <net/mpls.h> #include <linux/ipv6.h> #include <linux/in.h> #include <linux/jhash.h> #include <linux/random.h> #include <trace/events/napi.h> #include <trace/events/net.h> #include <trace/events/skb.h> #include <linux/pci.h> #include <linux/inetdevice.h> #include <linux/cpu_rmap.h> #include <linux/static_key.h> #include <linux/hashtable.h> #include <linux/vmalloc.h> #include <linux/if_macvlan.h> #include <linux/errqueue.h> #include <linux/hrtimer.h> #include <linux/netfilter_ingress.h> #include <linux/crash_dump.h> #include <linux/sctp.h> #include <net/udp_tunnel.h> #include <linux/net_namespace.h> #include "net-sysfs.h" #define MAX_GRO_SKBS 8 #define MAX_NEST_DEV 8 /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) static DEFINE_SPINLOCK(ptype_lock); static DEFINE_SPINLOCK(offload_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; struct list_head ptype_all __read_mostly; /* Taps */ static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info); static struct napi_struct *napi_by_id(unsigned int napi_id); /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl * semaphore. * * Pure readers hold dev_base_lock for reading, or rcu_read_lock() * * Writers must hold the rtnl semaphore while they loop through the * dev_base_head list, and hold dev_base_lock for writing when they do the * actual updates. This allows pure readers to access the list even * while a writer is preparing to update it. * * To put it another way, dev_base_lock is held for writing only to * protect against pure readers; the rtnl semaphore provides the * protection against other writers. * * See, for example usages, register_netdevice() and * unregister_netdevice(), which must be called with the rtnl * semaphore held. */ DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); static DEFINE_MUTEX(ifalias_mutex); /* protects napi_hash addition/deletion and napi_gen_id */ static DEFINE_SPINLOCK(napi_hash_lock); static unsigned int napi_gen_id = NR_CPUS; static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); static DECLARE_RWSEM(devnet_rename_sem); static inline void dev_base_seq_inc(struct net *net) { while (++net->dev_base_seq == 0) ; } static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) { unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ)); return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; } static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) { return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; } static inline void rps_lock(struct softnet_data *sd) { #ifdef CONFIG_RPS spin_lock(&sd->input_pkt_queue.lock); #endif } static inline void rps_unlock(struct softnet_data *sd) { #ifdef CONFIG_RPS spin_unlock(&sd->input_pkt_queue.lock); #endif } /* Device list insertion */ static void list_netdevice(struct net_device *dev) { struct net *net = dev_net(dev); ASSERT_RTNL(); write_lock_bh(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); write_unlock_bh(&dev_base_lock); dev_base_seq_inc(net); } /* Device list removal * caller must respect a RCU grace period before freeing/reusing dev */ static void unlist_netdevice(struct net_device *dev) { ASSERT_RTNL(); /* Unlink dev from the device chain */ write_lock_bh(&dev_base_lock); list_del_rcu(&dev->dev_list); hlist_del_rcu(&dev->name_hlist); hlist_del_rcu(&dev->index_hlist); write_unlock_bh(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } /* * Our notifier list */ static RAW_NOTIFIER_HEAD(netdev_chain); /* * Device drivers call our routines to queue packets here. We empty the * queue in the local softnet handler. */ DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); EXPORT_PER_CPU_SYMBOL(softnet_data); #ifdef CONFIG_LOCKDEP /* * register_netdevice() inits txq->_xmit_lock and sets lockdep class * according to dev->type */ static const unsigned short netdev_lock_type[] = { ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; static const char *const netdev_lock_name[] = { "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; static inline unsigned short netdev_lock_pos(unsigned short dev_type) { int i; for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) if (netdev_lock_type[i] == dev_type) return i; /* the last key is used by default */ return ARRAY_SIZE(netdev_lock_type) - 1; } static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, unsigned short dev_type) { int i; i = netdev_lock_pos(dev_type); lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], netdev_lock_name[i]); } static inline void netdev_set_addr_lockdep_class(struct net_device *dev) { int i; i = netdev_lock_pos(dev->type); lockdep_set_class_and_name(&dev->addr_list_lock, &netdev_addr_lock_key[i], netdev_lock_name[i]); } #else static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, unsigned short dev_type) { } static inline void netdev_set_addr_lockdep_class(struct net_device *dev) { } #endif /******************************************************************************* * * Protocol management and registration routines * *******************************************************************************/ /* * Add a protocol ID to the list. Now that the input handler is * smarter we can dispense with all the messy stuff that used to be * here. * * BEWARE!!! Protocol handlers, mangling input packets, * MUST BE last in hash buckets and checking protocol handlers * MUST start from promiscuous ptype_all chain in net_bh. * It is true now, do not change it. * Explanation follows: if protocol handler, mangling packet, will * be the first on list, it is not able to sense, that packet * is cloned and should be copied-on-write, so that it will * change it and subsequent readers will get broken packet. * --ANK (980803) */ static inline struct list_head *ptype_head(const struct packet_type *pt) { if (pt->type == htons(ETH_P_ALL)) return pt->dev ? &pt->dev->ptype_all : &ptype_all; else return pt->dev ? &pt->dev->ptype_specific : &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; } /** * dev_add_pack - add packet handler * @pt: packet type declaration * * Add a protocol handler to the networking stack. The passed &packet_type * is linked into kernel lists and may not be freed until it has been * removed from the kernel lists. * * This call does not sleep therefore it can not * guarantee all CPU's that are in middle of receiving packets * will see the new packet type (until the next received packet). */ void dev_add_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); spin_lock(&ptype_lock); list_add_rcu(&pt->list, head); spin_unlock(&ptype_lock); } EXPORT_SYMBOL(dev_add_pack); /** * __dev_remove_pack - remove packet handler * @pt: packet type declaration * * Remove a protocol handler that was previously added to the kernel * protocol handlers by dev_add_pack(). The passed &packet_type is removed * from the kernel lists and can be freed or reused once this function * returns. * * The packet type might still be in use by receivers * and must not be freed until after all the CPU's have gone * through a quiescent state. */ void __dev_remove_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); struct packet_type *pt1; spin_lock(&ptype_lock); list_for_each_entry(pt1, head, list) { if (pt == pt1) { list_del_rcu(&pt->list); goto out; } } pr_warn("dev_remove_pack: %p not found\n", pt); out: spin_unlock(&ptype_lock); } EXPORT_SYMBOL(__dev_remove_pack); /** * dev_remove_pack - remove packet handler * @pt: packet type declaration * * Remove a protocol handler that was previously added to the kernel * protocol handlers by dev_add_pack(). The passed &packet_type is removed * from the kernel lists and can be freed or reused once this function * returns. * * This call sleeps to guarantee that no CPU is looking at the packet * type after return. */ void dev_remove_pack(struct packet_type *pt) { __dev_remove_pack(pt); synchronize_net(); } EXPORT_SYMBOL(dev_remove_pack); /** * dev_add_offload - register offload handlers * @po: protocol offload declaration * * Add protocol offload handlers to the networking stack. The passed * &proto_offload is linked into kernel lists and may not be freed until * it has been removed from the kernel lists. * * This call does not sleep therefore it can not * guarantee all CPU's that are in middle of receiving packets * will see the new offload handlers (until the next received packet). */ void dev_add_offload(struct packet_offload *po) { struct packet_offload *elem; spin_lock(&offload_lock); list_for_each_entry(elem, &offload_base, list) { if (po->priority < elem->priority) break; } list_add_rcu(&po->list, elem->list.prev); spin_unlock(&offload_lock); } EXPORT_SYMBOL(dev_add_offload); /** * __dev_remove_offload - remove offload handler * @po: packet offload declaration * * Remove a protocol offload handler that was previously added to the * kernel offload handlers by dev_add_offload(). The passed &offload_type * is removed from the kernel lists and can be freed or reused once this * function returns. * * The packet type might still be in use by receivers * and must not be freed until after all the CPU's have gone * through a quiescent state. */ static void __dev_remove_offload(struct packet_offload *po) { struct list_head *head = &offload_base; struct packet_offload *po1; spin_lock(&offload_lock); list_for_each_entry(po1, head, list) { if (po == po1) { list_del_rcu(&po->list); goto out; } } pr_warn("dev_remove_offload: %p not found\n", po); out: spin_unlock(&offload_lock); } /** * dev_remove_offload - remove packet offload handler * @po: packet offload declaration * * Remove a packet offload handler that was previously added to the kernel * offload handlers by dev_add_offload(). The passed &offload_type is * removed from the kernel lists and can be freed or reused once this * function returns. * * This call sleeps to guarantee that no CPU is looking at the packet * type after return. */ void dev_remove_offload(struct packet_offload *po) { __dev_remove_offload(po); synchronize_net(); } EXPORT_SYMBOL(dev_remove_offload); /****************************************************************************** * * Device Boot-time Settings Routines * ******************************************************************************/ /* Boot time configuration table */ static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; /** * netdev_boot_setup_add - add new setup entry * @name: name of the device * @map: configured settings for the device * * Adds new setup entry to the dev_boot_setup list. The function * returns 0 on error and 1 on success. This is a generic routine to * all netdevices. */ static int netdev_boot_setup_add(char *name, struct ifmap *map) { struct netdev_boot_setup *s; int i; s = dev_boot_setup; for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { memset(s[i].name, 0, sizeof(s[i].name)); strlcpy(s[i].name, name, IFNAMSIZ); memcpy(&s[i].map, map, sizeof(s[i].map)); break; } } return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; } /** * netdev_boot_setup_check - check boot time settings * @dev: the netdevice * * Check boot time settings for the device. * The found settings are set for the device to be used * later in the device probing. * Returns 0 if no settings found, 1 if they are. */ int netdev_boot_setup_check(struct net_device *dev) { struct netdev_boot_setup *s = dev_boot_setup; int i; for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && !strcmp(dev->name, s[i].name)) { dev->irq = s[i].map.irq; dev->base_addr = s[i].map.base_addr; dev->mem_start = s[i].map.mem_start; dev->mem_end = s[i].map.mem_end; return 1; } } return 0; } EXPORT_SYMBOL(netdev_boot_setup_check); /** * netdev_boot_base - get address from boot time settings * @prefix: prefix for network device * @unit: id for network device * * Check boot time settings for the base address of device. * The found settings are set for the device to be used * later in the device probing. * Returns 0 if no settings found. */ unsigned long netdev_boot_base(const char *prefix, int unit) { const struct netdev_boot_setup *s = dev_boot_setup; char name[IFNAMSIZ]; int i; sprintf(name, "%s%d", prefix, unit); /* * If device already registered then return base of 1 * to indicate not to probe for this interface */ if (__dev_get_by_name(&init_net, name)) return 1; for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) if (!strcmp(name, s[i].name)) return s[i].map.base_addr; return 0; } /* * Saves at boot time configured settings for any netdevice. */ int __init netdev_boot_setup(char *str) { int ints[5]; struct ifmap map; str = get_options(str, ARRAY_SIZE(ints), ints); if (!str || !*str) return 0; /* Save settings */ memset(&map, 0, sizeof(map)); if (ints[0] > 0) map.irq = ints[1]; if (ints[0] > 1) map.base_addr = ints[2]; if (ints[0] > 2) map.mem_start = ints[3]; if (ints[0] > 3) map.mem_end = ints[4]; /* Add new entry to the list */ return netdev_boot_setup_add(str, &map); } __setup("netdev=", netdev_boot_setup); /******************************************************************************* * * Device Interface Subroutines * *******************************************************************************/ /** * dev_get_iflink - get 'iflink' value of a interface * @dev: targeted interface * * Indicates the ifindex the interface is linked to. * Physical interfaces have the same 'ifindex' and 'iflink' values. */ int dev_get_iflink(const struct net_device *dev) { if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) return dev->netdev_ops->ndo_get_iflink(dev); return dev->ifindex; } EXPORT_SYMBOL(dev_get_iflink); /** * dev_fill_metadata_dst - Retrieve tunnel egress information. * @dev: targeted interface * @skb: The packet. * * For better visibility of tunnel traffic OVS needs to retrieve * egress tunnel information for a packet. Following API allows * user to get this info. */ int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info; if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) return -EINVAL; info = skb_tunnel_info_unclone(skb); if (!info) return -ENOMEM; if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) return -EINVAL; return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); } EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace * @name: name to find * * Find an interface by name. Must be called under RTNL semaphore * or @dev_base_lock. If the name is found a pointer to the device * is returned. If the name is not found then %NULL is returned. The * reference counters are not incremented so the caller must be * careful with locks. */ struct net_device *__dev_get_by_name(struct net *net, const char *name) { struct net_device *dev; struct hlist_head *head = dev_name_hash(net, name); hlist_for_each_entry(dev, head, name_hlist) if (!strncmp(dev->name, name, IFNAMSIZ)) return dev; return NULL; } EXPORT_SYMBOL(__dev_get_by_name); /** * dev_get_by_name_rcu - find a device by its name * @net: the applicable net namespace * @name: name to find * * Find an interface by name. * If the name is found a pointer to the device is returned. * If the name is not found then %NULL is returned. * The reference counters are not incremented so the caller must be * careful with locks. The caller must hold RCU lock. */ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) { struct net_device *dev; struct hlist_head *head = dev_name_hash(net, name); hlist_for_each_entry_rcu(dev, head, name_hlist) if (!strncmp(dev->name, name, IFNAMSIZ)) return dev; return NULL; } EXPORT_SYMBOL(dev_get_by_name_rcu); /** * dev_get_by_name - find a device by its name * @net: the applicable net namespace * @name: name to find * * Find an interface by name. This can be called from any * context and does its own locking. The returned handle has * the usage count incremented and the caller must use dev_put() to * release it when it is no longer needed. %NULL is returned if no * matching device is found. */ struct net_device *dev_get_by_name(struct net *net, const char *name) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_name_rcu(net, name); if (dev) dev_hold(dev); rcu_read_unlock(); return dev; } EXPORT_SYMBOL(dev_get_by_name); /** * __dev_get_by_index - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful * about locking. The caller must hold either the RTNL semaphore * or @dev_base_lock. */ struct net_device *__dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex); hlist_for_each_entry(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; return NULL; } EXPORT_SYMBOL(__dev_get_by_index); /** * dev_get_by_index_rcu - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful * about locking. The caller must hold RCU lock. */ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) { struct net_device *dev; struct hlist_head *head = dev_index_hash(net, ifindex); hlist_for_each_entry_rcu(dev, head, index_hlist) if (dev->ifindex == ifindex) return dev; return NULL; } EXPORT_SYMBOL(dev_get_by_index_rcu); /** * dev_get_by_index - find a device by its ifindex * @net: the applicable net namespace * @ifindex: index of device * * Search for an interface by index. Returns NULL if the device * is not found or a pointer to the device. The device returned has * had a reference added and the pointer is safe until the user calls * dev_put to indicate they have finished with it. */ struct net_device *dev_get_by_index(struct net *net, int ifindex) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) dev_hold(dev); rcu_read_unlock(); return dev; } EXPORT_SYMBOL(dev_get_by_index); /** * dev_get_by_napi_id - find a device by napi_id * @napi_id: ID of the NAPI struct * * Search for an interface by NAPI ID. Returns %NULL if the device * is not found or a pointer to the device. The device has not had * its reference counter increased so the caller must be careful * about locking. The caller must hold RCU lock. */ struct net_device *dev_get_by_napi_id(unsigned int napi_id) { struct napi_struct *napi; WARN_ON_ONCE(!rcu_read_lock_held()); if (napi_id < MIN_NAPI_ID) return NULL; napi = napi_by_id(napi_id); return napi ? napi->dev : NULL; } EXPORT_SYMBOL(dev_get_by_napi_id); /** * netdev_get_name - get a netdevice name, knowing its ifindex. * @net: network namespace * @name: a pointer to the buffer where the name will be stored. * @ifindex: the ifindex of the interface to get the name from. */ int netdev_get_name(struct net *net, char *name, int ifindex) { struct net_device *dev; int ret; down_read(&devnet_rename_sem); rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (!dev) { ret = -ENODEV; goto out; } strcpy(name, dev->name); ret = 0; out: rcu_read_unlock(); up_read(&devnet_rename_sem); return ret; } /** * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace * @type: media type of device * @ha: hardware address * * Search for an interface by MAC address. Returns NULL if the device * is not found or a pointer to the device. * The caller must hold RCU or RTNL. * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * */ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, const char *ha) { struct net_device *dev; for_each_netdev_rcu(net, dev) if (dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len)) return dev; return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev; ASSERT_RTNL(); for_each_netdev(net, dev) if (dev->type == type) return dev; return NULL; } EXPORT_SYMBOL(__dev_getfirstbyhwtype); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; rcu_read_lock(); for_each_netdev_rcu(net, dev) if (dev->type == type) { dev_hold(dev); ret = dev; break; } rcu_read_unlock(); return ret; } EXPORT_SYMBOL(dev_getfirstbyhwtype); /** * __dev_get_by_flags - find any device with given flags * @net: the applicable net namespace * @if_flags: IFF_* values * @mask: bitmask of bits in if_flags to check * * Search for any interface with the given flags. Returns NULL if a device * is not found or a pointer to the device. Must be called inside * rtnl_lock(), and result refcount is unchanged. */ struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask) { struct net_device *dev, *ret; ASSERT_RTNL(); ret = NULL; for_each_netdev(net, dev) { if (((dev->flags ^ if_flags) & mask) == 0) { ret = dev; break; } } return ret; } EXPORT_SYMBOL(__dev_get_by_flags); /** * dev_valid_name - check if name is okay for network device * @name: name string * * Network device names need to be valid file names to * to allow sysfs to work. We also disallow any kind of * whitespace. */ bool dev_valid_name(const char *name) { if (*name == '\0') return false; if (strnlen(name, IFNAMSIZ) == IFNAMSIZ) return false; if (!strcmp(name, ".") || !strcmp(name, "..")) return false; while (*name) { if (*name == '/' || *name == ':' || isspace(*name)) return false; name++; } return true; } EXPORT_SYMBOL(dev_valid_name); /** * __dev_alloc_name - allocate a name for a device * @net: network namespace to allocate the device name in * @name: name format string * @buf: scratch buffer and result name string * * Passed a format string - eg "lt%d" it will try and find a suitable * id. It scans list of devices to build up a free map, then chooses * the first empty slot. The caller must hold the dev_base or rtnl lock * while allocating the name and adding the device in order to avoid * duplicates. * Limited to bits_per_byte * page size devices (ie 32K on most platforms). * Returns the number of the unit assigned or a negative errno code. */ static int __dev_alloc_name(struct net *net, const char *name, char *buf) { int i = 0; const char *p; const int max_netdevices = 8*PAGE_SIZE; unsigned long *inuse; struct net_device *d; if (!dev_valid_name(name)) return -EINVAL; p = strchr(name, '%'); if (p) { /* * Verify the string as this thing may have come from * the user. There must be either one "%d" and no other "%" * characters. */ if (p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; /* Use one page as a bit array of possible slots */ inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); if (!inuse) return -ENOMEM; for_each_netdev(net, d) { if (!sscanf(d->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) continue; /* avoid cases where sscanf is not exact inverse of printf */ snprintf(buf, IFNAMSIZ, name, i); if (!strncmp(buf, d->name, IFNAMSIZ)) set_bit(i, inuse); } i = find_first_zero_bit(inuse, max_netdevices); free_page((unsigned long) inuse); } snprintf(buf, IFNAMSIZ, name, i); if (!__dev_get_by_name(net, buf)) return i; /* It is possible to run out of possible slots * when the name is long and there isn't enough space left * for the digits, or if all bits are used. */ return -ENFILE; } static int dev_alloc_name_ns(struct net *net, struct net_device *dev, const char *name) { char buf[IFNAMSIZ]; int ret; BUG_ON(!net); ret = __dev_alloc_name(net, name, buf); if (ret >= 0) strlcpy(dev->name, buf, IFNAMSIZ); return ret; } /** * dev_alloc_name - allocate a name for a device * @dev: device * @name: name format string * * Passed a format string - eg "lt%d" it will try and find a suitable * id. It scans list of devices to build up a free map, then chooses * the first empty slot. The caller must hold the dev_base or rtnl lock * while allocating the name and adding the device in order to avoid * duplicates. * Limited to bits_per_byte * page size devices (ie 32K on most platforms). * Returns the number of the unit assigned or a negative errno code. */ int dev_alloc_name(struct net_device *dev, const char *name) { return dev_alloc_name_ns(dev_net(dev), dev, name); } EXPORT_SYMBOL(dev_alloc_name); int dev_get_valid_name(struct net *net, struct net_device *dev, const char *name) { BUG_ON(!net); if (!dev_valid_name(name)) return -EINVAL; if (strchr(name, '%')) return dev_alloc_name_ns(net, dev, name); else if (__dev_get_by_name(net, name)) return -EEXIST; else if (dev->name != name) strlcpy(dev->name, name, IFNAMSIZ); return 0; } EXPORT_SYMBOL(dev_get_valid_name); /** * dev_change_name - change name of a device * @dev: device * @newname: name (or format string) must be at least IFNAMSIZ * * Change name of a device, can pass format strings "eth%d". * for wildcarding. */ int dev_change_name(struct net_device *dev, const char *newname) { unsigned char old_assign_type; char oldname[IFNAMSIZ]; int err = 0; int ret; struct net *net; ASSERT_RTNL(); BUG_ON(!dev_net(dev)); net = dev_net(dev); /* Some auto-enslaved devices e.g. failover slaves are * special, as userspace might rename the device after * the interface had been brought up and running since * the point kernel initiated auto-enslavement. Allow * live name change even when these slave devices are * up and running. * * Typically, users of these auto-enslaving devices * don't actually care about slave name change, as * they are supposed to operate on master interface * directly. */ if (dev->flags & IFF_UP && likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK))) return -EBUSY; down_write(&devnet_rename_sem); if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { up_write(&devnet_rename_sem); return 0; } memcpy(oldname, dev->name, IFNAMSIZ); err = dev_get_valid_name(net, dev, newname); if (err < 0) { up_write(&devnet_rename_sem); return err; } if (oldname[0] && !strchr(oldname, '%')) netdev_info(dev, "renamed from %s\n", oldname); old_assign_type = dev->name_assign_type; dev->name_assign_type = NET_NAME_RENAMED; rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); dev->name_assign_type = old_assign_type; up_write(&devnet_rename_sem); return ret; } up_write(&devnet_rename_sem); netdev_adjacent_rename_links(dev, oldname); write_lock_bh(&dev_base_lock); hlist_del_rcu(&dev->name_hlist); write_unlock_bh(&dev_base_lock); synchronize_rcu(); write_lock_bh(&dev_base_lock); hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); write_unlock_bh(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); if (ret) { /* err >= 0 after dev_alloc_name() or stores the first errno */ if (err >= 0) { err = ret; down_write(&devnet_rename_sem); memcpy(dev->name, oldname, IFNAMSIZ); memcpy(oldname, newname, IFNAMSIZ); dev->name_assign_type = old_assign_type; old_assign_type = NET_NAME_RENAMED; goto rollback; } else { pr_err("%s: name change rollback failed: %d\n", dev->name, ret); } } return err; } /** * dev_set_alias - change ifalias of a device * @dev: device * @alias: name up to IFALIASZ * @len: limit of bytes to copy from info * * Set ifalias for a device, */ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) { struct dev_ifalias *new_alias = NULL; if (len >= IFALIASZ) return -EINVAL; if (len) { new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL); if (!new_alias) return -ENOMEM; memcpy(new_alias->ifalias, alias, len); new_alias->ifalias[len] = 0; } mutex_lock(&ifalias_mutex); rcu_swap_protected(dev->ifalias, new_alias, mutex_is_locked(&ifalias_mutex)); mutex_unlock(&ifalias_mutex); if (new_alias) kfree_rcu(new_alias, rcuhead); return len; } EXPORT_SYMBOL(dev_set_alias); /** * dev_get_alias - get ifalias of a device * @dev: device * @name: buffer to store name of ifalias * @len: size of buffer * * get ifalias for a device. Caller must make sure dev cannot go * away, e.g. rcu read lock or own a reference count to device. */ int dev_get_alias(const struct net_device *dev, char *name, size_t len) { const struct dev_ifalias *alias; int ret = 0; rcu_read_lock(); alias = rcu_dereference(dev->ifalias); if (alias) ret = snprintf(name, len, "%s", alias->ifalias); rcu_read_unlock(); return ret; } /** * netdev_features_change - device changes features * @dev: device to cause notification * * Called to indicate a device has changed features. */ void netdev_features_change(struct net_device *dev) { call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); } EXPORT_SYMBOL(netdev_features_change); /** * netdev_state_change - device changes state * @dev: device to cause notification * * Called to indicate a device has changed state. This function calls * the notifier chains for netdev_chain and sends a NEWLINK message * to the routing socket. */ void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { struct netdev_notifier_change_info change_info = { .info.dev = dev, }; call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); } } EXPORT_SYMBOL(netdev_state_change); /** * netdev_notify_peers - notify network peers about existence of @dev * @dev: network device * * Generate traffic such that interested network peers are aware of * @dev, such as by generating a gratuitous ARP. This may be used when * a device wants to inform the rest of the network about some sort of * reconfiguration such as a failover event or virtual machine * migration. */ void netdev_notify_peers(struct net_device *dev) { rtnl_lock(); call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); rtnl_unlock(); } EXPORT_SYMBOL(netdev_notify_peers); static int __dev_open(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; int ret; ASSERT_RTNL(); if (!netif_device_present(dev)) return -ENODEV; /* Block netpoll from trying to do any rx path servicing. * If we don't do this there is a chance ndo_poll_controller * or ndo_poll may be running while we open the device */ netpoll_poll_disable(dev); ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); ret = notifier_to_errno(ret); if (ret) return ret; set_bit(__LINK_STATE_START, &dev->state); if (ops->ndo_validate_addr) ret = ops->ndo_validate_addr(dev); if (!ret && ops->ndo_open) ret = ops->ndo_open(dev); netpoll_poll_enable(dev); if (ret) clear_bit(__LINK_STATE_START, &dev->state); else { dev->flags |= IFF_UP; dev_set_rx_mode(dev); dev_activate(dev); add_device_randomness(dev->dev_addr, dev->addr_len); } return ret; } /** * dev_open - prepare an interface for use. * @dev: device to open * * Takes a device from down to up state. The device's private open * function is invoked and then the multicast lists are loaded. Finally * the device is moved into the up state and a %NETDEV_UP message is * sent to the netdev notifier chain. * * Calling this function on an active interface is a nop. On a failure * a negative errno code is returned. */ int dev_open(struct net_device *dev) { int ret; if (dev->flags & IFF_UP) return 0; ret = __dev_open(dev); if (ret < 0) return ret; rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); call_netdevice_notifiers(NETDEV_UP, dev); return ret; } EXPORT_SYMBOL(dev_open); static void __dev_close_many(struct list_head *head) { struct net_device *dev; ASSERT_RTNL(); might_sleep(); list_for_each_entry(dev, head, close_list) { /* Temporarily disable netpoll until the interface is down */ netpoll_poll_disable(dev); call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); clear_bit(__LINK_STATE_START, &dev->state); /* Synchronize to scheduled poll. We cannot touch poll list, it * can be even on different cpu. So just clear netif_running(). * * dev->stop() will invoke napi_disable() on all of it's * napi_struct instances on this device. */ smp_mb__after_atomic(); /* Commit netif_running(). */ } dev_deactivate_many(head); list_for_each_entry(dev, head, close_list) { const struct net_device_ops *ops = dev->netdev_ops; /* * Call the device specific close. This cannot fail. * Only if device is UP * * We allow it to be called even after a DETACH hot-plug * event. */ if (ops->ndo_stop) ops->ndo_stop(dev); dev->flags &= ~IFF_UP; netpoll_poll_enable(dev); } } static void __dev_close(struct net_device *dev) { LIST_HEAD(single); list_add(&dev->close_list, &single); __dev_close_many(&single); list_del(&single); } void dev_close_many(struct list_head *head, bool unlink) { struct net_device *dev, *tmp; /* Remove the devices that don't need to be closed */ list_for_each_entry_safe(dev, tmp, head, close_list) if (!(dev->flags & IFF_UP)) list_del_init(&dev->close_list); __dev_close_many(head); list_for_each_entry_safe(dev, tmp, head, close_list) { rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); call_netdevice_notifiers(NETDEV_DOWN, dev); if (unlink) list_del_init(&dev->close_list); } } EXPORT_SYMBOL(dev_close_many); /** * dev_close - shutdown an interface. * @dev: device to shutdown * * This function moves an active device into down state. A * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier * chain. */ void dev_close(struct net_device *dev) { if (dev->flags & IFF_UP) { LIST_HEAD(single); list_add(&dev->close_list, &single); dev_close_many(&single, true); list_del(&single); } } EXPORT_SYMBOL(dev_close); /** * dev_disable_lro - disable Large Receive Offload on a device * @dev: device * * Disable Large Receive Offload (LRO) on a net device. Must be * called under RTNL. This is needed if received packets may be * forwarded to another interface. */ void dev_disable_lro(struct net_device *dev) { struct net_device *lower_dev; struct list_head *iter; dev->wanted_features &= ~NETIF_F_LRO; netdev_update_features(dev); if (unlikely(dev->features & NETIF_F_LRO)) netdev_WARN(dev, "failed to disable LRO!\n"); netdev_for_each_lower_dev(dev, lower_dev, iter) dev_disable_lro(lower_dev); } EXPORT_SYMBOL(dev_disable_lro); /** * dev_disable_gro_hw - disable HW Generic Receive Offload on a device * @dev: device * * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be * called under RTNL. This is needed if Generic XDP is installed on * the device. */ static void dev_disable_gro_hw(struct net_device *dev) { dev->wanted_features &= ~NETIF_F_GRO_HW; netdev_update_features(dev); if (unlikely(dev->features & NETIF_F_GRO_HW)) netdev_WARN(dev, "failed to disable GRO_HW!\n"); } const char *netdev_cmd_to_name(enum netdev_cmd cmd) { #define N(val) \ case NETDEV_##val: \ return "NETDEV_" __stringify(val); switch (cmd) { N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER) N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE) N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE) N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) } #undef N return "UNKNOWN_NETDEV_EVENT"; } EXPORT_SYMBOL_GPL(netdev_cmd_to_name); static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { struct netdev_notifier_info info = { .dev = dev, }; return nb->notifier_call(nb, val, &info); } static int dev_boot_phase = 1; /** * register_netdevice_notifier - register a network notifier block * @nb: notifier * * Register a notifier to be called when network device events occur. * The notifier passed is linked into the kernel structures and must * not be reused until it has been unregistered. A negative errno code * is returned on a failure. * * When registered all registration and up events are replayed * to the new notifier to allow device to have a race free * view of the network device list. */ int register_netdevice_notifier(struct notifier_block *nb) { struct net_device *dev; struct net_device *last; struct net *net; int err; /* Close race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock(); err = raw_notifier_chain_register(&netdev_chain, nb); if (err) goto unlock; if (dev_boot_phase) goto unlock; for_each_net(net) { for_each_netdev(net, dev) { err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); err = notifier_to_errno(err); if (err) goto rollback; if (!(dev->flags & IFF_UP)) continue; call_netdevice_notifier(nb, NETDEV_UP, dev); } } unlock: rtnl_unlock(); up_write(&pernet_ops_rwsem); return err; rollback: last = dev; for_each_net(net) { for_each_netdev(net, dev) { if (dev == last) goto outroll; if (dev->flags & IFF_UP) { call_netdevice_notifier(nb, NETDEV_GOING_DOWN, dev); call_netdevice_notifier(nb, NETDEV_DOWN, dev); } call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } } outroll: raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } EXPORT_SYMBOL(register_netdevice_notifier); /** * unregister_netdevice_notifier - unregister a network notifier block * @nb: notifier * * Unregister a notifier previously registered by * register_netdevice_notifier(). The notifier is unlinked into the * kernel structures and may then be reused. A negative errno code * is returned on a failure. * * After unregistering unregister and down device events are synthesized * for all devices on the device list to the removed notifier to remove * the need for special case cleanup code. */ int unregister_netdevice_notifier(struct notifier_block *nb) { struct net_device *dev; struct net *net; int err; /* Close race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock(); err = raw_notifier_chain_unregister(&netdev_chain, nb); if (err) goto unlock; for_each_net(net) { for_each_netdev(net, dev) { if (dev->flags & IFF_UP) { call_netdevice_notifier(nb, NETDEV_GOING_DOWN, dev); call_netdevice_notifier(nb, NETDEV_DOWN, dev); } call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } } unlock: rtnl_unlock(); up_write(&pernet_ops_rwsem); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier); /** * call_netdevice_notifiers_info - call all network notifier blocks * @val: value passed unmodified to notifier function * @info: notifier information data * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). */ static int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info) { ASSERT_RTNL(); return raw_notifier_call_chain(&netdev_chain, val, info); } /** * call_netdevice_notifiers - call all network notifier blocks * @val: value passed unmodified to notifier function * @dev: net_device pointer passed unmodified to notifier function * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). */ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { struct netdev_notifier_info info = { .dev = dev, }; return call_netdevice_notifiers_info(val, &info); } EXPORT_SYMBOL(call_netdevice_notifiers); /** * call_netdevice_notifiers_mtu - call all network notifier blocks * @val: value passed unmodified to notifier function * @dev: net_device pointer passed unmodified to notifier function * @arg: additional u32 argument passed to the notifier function * * Call all network notifier blocks. Parameters and return value * are as for raw_notifier_call_chain(). */ static int call_netdevice_notifiers_mtu(unsigned long val, struct net_device *dev, u32 arg) { struct netdev_notifier_info_ext info = { .info.dev = dev, .ext.mtu = arg, }; BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0); return call_netdevice_notifiers_info(val, &info.info); } #ifdef CONFIG_NET_INGRESS static DEFINE_STATIC_KEY_FALSE(ingress_needed_key); void net_inc_ingress_queue(void) { static_branch_inc(&ingress_needed_key); } EXPORT_SYMBOL_GPL(net_inc_ingress_queue); void net_dec_ingress_queue(void) { static_branch_dec(&ingress_needed_key); } EXPORT_SYMBOL_GPL(net_dec_ingress_queue); #endif #ifdef CONFIG_NET_EGRESS static DEFINE_STATIC_KEY_FALSE(egress_needed_key); void net_inc_egress_queue(void) { static_branch_inc(&egress_needed_key); } EXPORT_SYMBOL_GPL(net_inc_egress_queue); void net_dec_egress_queue(void) { static_branch_dec(&egress_needed_key); } EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); #ifdef CONFIG_JUMP_LABEL static atomic_t netstamp_needed_deferred; static atomic_t netstamp_wanted; static void netstamp_clear(struct work_struct *work) { int deferred = atomic_xchg(&netstamp_needed_deferred, 0); int wanted; wanted = atomic_add_return(deferred, &netstamp_wanted); if (wanted > 0) static_branch_enable(&netstamp_needed_key); else static_branch_disable(&netstamp_needed_key); } static DECLARE_WORK(netstamp_work, netstamp_clear); #endif void net_enable_timestamp(void) { #ifdef CONFIG_JUMP_LABEL int wanted; while (1) { wanted = atomic_read(&netstamp_wanted); if (wanted <= 0) break; if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted) return; } atomic_inc(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else static_branch_inc(&netstamp_needed_key); #endif } EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { #ifdef CONFIG_JUMP_LABEL int wanted; while (1) { wanted = atomic_read(&netstamp_wanted); if (wanted <= 1) break; if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted) return; } atomic_dec(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else static_branch_dec(&netstamp_needed_key); #endif } EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp = 0; if (static_branch_unlikely(&netstamp_needed_key)) __net_timestamp(skb); } #define net_timestamp_check(COND, SKB) \ if (static_branch_unlikely(&netstamp_needed_key)) { \ if ((COND) && !(SKB)->tstamp) \ __net_timestamp(SKB); \ } \ bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) { unsigned int len; if (!(dev->flags & IFF_UP)) return false; len = dev->mtu + dev->hard_header_len + VLAN_HLEN; if (skb->len <= len) return true; /* if TSO is enabled, we don't care about the length as the packet * could be forwarded without being segmented before */ if (skb_is_gso(skb)) return true; return false; } EXPORT_SYMBOL_GPL(is_skb_forwardable); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { int ret = ____dev_forward_skb(dev, skb); if (likely(!ret)) { skb->protocol = eth_type_trans(skb, dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } return ret; } EXPORT_SYMBOL_GPL(__dev_forward_skb); /** * dev_forward_skb - loopback an skb to another netif * * @dev: destination network device * @skb: buffer to forward * * return values: * NET_RX_SUCCESS (no congestion) * NET_RX_DROP (packet was dropped, but freed) * * dev_forward_skb can be used for injecting an skb from the * start_xmit function of one device into the receive queue * of another device. * * The receiving device may be in another namespace, so * we have to clear all information in the skb that could * impact namespace isolation. */ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); } EXPORT_SYMBOL_GPL(dev_forward_skb); static inline int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) return -ENOMEM; refcount_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } static inline void deliver_ptype_list_skb(struct sk_buff *skb, struct packet_type **pt, struct net_device *orig_dev, __be16 type, struct list_head *ptype_list) { struct packet_type *ptype, *pt_prev = *pt; list_for_each_entry_rcu(ptype, ptype_list, list) { if (ptype->type != type) continue; if (pt_prev) deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } *pt = pt_prev; } static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) { if (!ptype->af_packet_priv || !skb->sk) return false; if (ptype->id_match) return ptype->id_match(ptype, skb->sk); else if ((struct sock *)ptype->af_packet_priv == skb->sk) return true; return false; } /* * Support routine. Sends outgoing frames to any network * taps currently in use. */ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct packet_type *ptype; struct sk_buff *skb2 = NULL; struct packet_type *pt_prev = NULL; struct list_head *ptype_list = &ptype_all; rcu_read_lock(); again: list_for_each_entry_rcu(ptype, ptype_list, list) { /* Never send packets back to the socket * they originated from - MvS (miquels@drinkel.ow.org) */ if (skb_loop_sk(ptype, skb)) continue; if (pt_prev) { deliver_skb(skb2, pt_prev, skb->dev); pt_prev = ptype; continue; } /* need to clone skb, done only once */ skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) goto out_unlock; net_timestamp_set(skb2); /* skb->nh should be correctly * set by sender, so that the second statement is * just protection against buggy protocols. */ skb_reset_mac_header(skb2); if (skb_network_header(skb2) < skb2->data || skb_network_header(skb2) > skb_tail_pointer(skb2)) { net_crit_ratelimited("protocol %04x is buggy, dev %s\n", ntohs(skb2->protocol), dev->name); skb_reset_network_header(skb2); } skb2->transport_header = skb2->network_header; skb2->pkt_type = PACKET_OUTGOING; pt_prev = ptype; } if (ptype_list == &ptype_all) { ptype_list = &dev->ptype_all; goto again; } out_unlock: if (pt_prev) { if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC)) pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); else kfree_skb(skb2); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); /** * netif_setup_tc - Handle tc mappings on real_num_tx_queues change * @dev: Network device * @txq: number of queues available * * If real_num_tx_queues is changed the tc mappings may no longer be * valid. To resolve this verify the tc mapping remains valid and if * not NULL the mapping. With no priorities mapping to this * offset/count pair it will no longer be used. In the worst case TC0 * is invalid nothing can be done so disable priority mappings. If is * expected that drivers will fix this mapping if they can before * calling netif_set_real_num_tx_queues. */ static void netif_setup_tc(struct net_device *dev, unsigned int txq) { int i; struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; /* If TC0 is invalidated disable TC mapping */ if (tc->offset + tc->count > txq) { pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); dev->num_tc = 0; return; } /* Invalidated prio to tc mappings set to TC0 */ for (i = 1; i < TC_BITMASK + 1; i++) { int q = netdev_get_prio_tc_map(dev, i); tc = &dev->tc_to_txq[q]; if (tc->offset + tc->count > txq) { pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", i, q); netdev_set_prio_tc_map(dev, i, 0); } } } int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) { if (dev->num_tc) { struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; int i; /* walk through the TCs and see if it falls into any of them */ for (i = 0; i < TC_MAX_QUEUE; i++, tc++) { if ((txq - tc->offset) < tc->count) return i; } /* didn't find it, just return -1 to indicate no match */ return -1; } return 0; } EXPORT_SYMBOL(netdev_txq_to_tc); #ifdef CONFIG_XPS struct static_key xps_needed __read_mostly; EXPORT_SYMBOL(xps_needed); struct static_key xps_rxqs_needed __read_mostly; EXPORT_SYMBOL(xps_rxqs_needed); static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) static bool remove_xps_queue(struct xps_dev_maps *dev_maps, int tci, u16 index) { struct xps_map *map = NULL; int pos; if (dev_maps) map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) return false; for (pos = map->len; pos--;) { if (map->queues[pos] != index) continue; if (map->len > 1) { map->queues[pos] = map->queues[--map->len]; break; } RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); return false; } return true; } static bool remove_xps_queue_cpu(struct net_device *dev, struct xps_dev_maps *dev_maps, int cpu, u16 offset, u16 count) { int num_tc = dev->num_tc ? : 1; bool active = false; int tci; for (tci = cpu * num_tc; num_tc--; tci++) { int i, j; for (i = count, j = offset; i--; j++) { if (!remove_xps_queue(dev_maps, tci, j)) break; } active |= i < 0; } return active; } static void reset_xps_maps(struct net_device *dev, struct xps_dev_maps *dev_maps, bool is_rxqs_map) { if (is_rxqs_map) { static_key_slow_dec_cpuslocked(&xps_rxqs_needed); RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); } else { RCU_INIT_POINTER(dev->xps_cpus_map, NULL); } static_key_slow_dec_cpuslocked(&xps_needed); kfree_rcu(dev_maps, rcu); } static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, struct xps_dev_maps *dev_maps, unsigned int nr_ids, u16 offset, u16 count, bool is_rxqs_map) { bool active = false; int i, j; for (j = -1; j = netif_attrmask_next(j, mask, nr_ids), j < nr_ids;) active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); if (!active) reset_xps_maps(dev, dev_maps, is_rxqs_map); if (!is_rxqs_map) { for (i = offset + (count - 1); count--; i--) { netdev_queue_numa_node_write( netdev_get_tx_queue(dev, i), NUMA_NO_NODE); } } } static void netif_reset_xps_queues(struct net_device *dev, u16 offset, u16 count) { const unsigned long *possible_mask = NULL; struct xps_dev_maps *dev_maps; unsigned int nr_ids; if (!static_key_false(&xps_needed)) return; cpus_read_lock(); mutex_lock(&xps_map_mutex); if (static_key_false(&xps_rxqs_needed)) { dev_maps = xmap_dereference(dev->xps_rxqs_map); if (dev_maps) { nr_ids = dev->num_rx_queues; clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count, true); } } dev_maps = xmap_dereference(dev->xps_cpus_map); if (!dev_maps) goto out_no_maps; if (num_possible_cpus() > 1) possible_mask = cpumask_bits(cpu_possible_mask); nr_ids = nr_cpu_ids; clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count, false); out_no_maps: mutex_unlock(&xps_map_mutex); cpus_read_unlock(); } static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) { netif_reset_xps_queues(dev, index, dev->num_tx_queues - index); } static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, u16 index, bool is_rxqs_map) { struct xps_map *new_map; int alloc_len = XPS_MIN_MAP_ALLOC; int i, pos; for (pos = 0; map && pos < map->len; pos++) { if (map->queues[pos] != index) continue; return map; } /* Need to add tx-queue to this CPU's/rx-queue's existing map */ if (map) { if (pos < map->alloc_len) return map; alloc_len = map->alloc_len * 2; } /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's * map */ if (is_rxqs_map) new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL); else new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, cpu_to_node(attr_index)); if (!new_map) return NULL; for (i = 0; i < pos; i++) new_map->queues[i] = map->queues[i]; new_map->alloc_len = alloc_len; new_map->len = pos; return new_map; } /* Must be called under cpus_read_lock */ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, bool is_rxqs_map) { const unsigned long *online_mask = NULL, *possible_mask = NULL; struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; int i, j, tci, numa_node_id = -2; int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; bool active = false; unsigned int nr_ids; if (dev->num_tc) { /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; if (num_tc < 0) return -EINVAL; /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; tc = netdev_txq_to_tc(dev, index); if (tc < 0) return -EINVAL; } mutex_lock(&xps_map_mutex); if (is_rxqs_map) { maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); dev_maps = xmap_dereference(dev->xps_rxqs_map); nr_ids = dev->num_rx_queues; } else { maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); if (num_possible_cpus() > 1) { online_mask = cpumask_bits(cpu_online_mask); possible_mask = cpumask_bits(cpu_possible_mask); } dev_maps = xmap_dereference(dev->xps_cpus_map); nr_ids = nr_cpu_ids; } if (maps_sz < L1_CACHE_BYTES) maps_sz = L1_CACHE_BYTES; /* allocate memory for queue storage */ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), j < nr_ids;) { if (!new_dev_maps) new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) { mutex_unlock(&xps_map_mutex); return -ENOMEM; } tci = j * num_tc + tc; map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; map = expand_xps_map(map, j, index, is_rxqs_map); if (!map) goto error; RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } if (!new_dev_maps) goto out_no_new_maps; if (!dev_maps) { /* Increment static keys at most once per type */ static_key_slow_inc_cpuslocked(&xps_needed); if (is_rxqs_map) static_key_slow_inc_cpuslocked(&xps_rxqs_needed); } for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), j < nr_ids;) { /* copy maps belonging to foreign traffic classes */ for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) { /* fill in the new device map from the old device map */ map = xmap_dereference(dev_maps->attr_map[tci]); RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } /* We need to explicitly update tci as prevous loop * could break out early if dev_maps is NULL. */ tci = j * num_tc + tc; if (netif_attr_test_mask(j, mask, nr_ids) && netif_attr_test_online(j, online_mask, nr_ids)) { /* add tx-queue to CPU/rx-queue maps */ int pos = 0; map = xmap_dereference(new_dev_maps->attr_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; if (pos == map->len) map->queues[map->len++] = index; #ifdef CONFIG_NUMA if (!is_rxqs_map) { if (numa_node_id == -2) numa_node_id = cpu_to_node(j); else if (numa_node_id != cpu_to_node(j)) numa_node_id = -1; } #endif } else if (dev_maps) { /* fill in the new device map from the old device map */ map = xmap_dereference(dev_maps->attr_map[tci]); RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } /* copy maps belonging to foreign traffic classes */ for (i = num_tc - tc, tci++; dev_maps && --i; tci++) { /* fill in the new device map from the old device map */ map = xmap_dereference(dev_maps->attr_map[tci]); RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } } if (is_rxqs_map) rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps); else rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps); /* Cleanup old maps */ if (!dev_maps) goto out_no_old_maps; for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), j < nr_ids;) { for (i = num_tc, tci = j * num_tc; i--; tci++) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); map = xmap_dereference(dev_maps->attr_map[tci]); if (map && map != new_map) kfree_rcu(map, rcu); } } kfree_rcu(dev_maps, rcu); out_no_old_maps: dev_maps = new_dev_maps; active = true; out_no_new_maps: if (!is_rxqs_map) { /* update Tx queue numa node */ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), (numa_node_id >= 0) ? numa_node_id : NUMA_NO_NODE); } if (!dev_maps) goto out_no_maps; /* removes tx-queue from unused CPUs/rx-queues */ for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), j < nr_ids;) { for (i = tc, tci = j * num_tc; i--; tci++) active |= remove_xps_queue(dev_maps, tci, index); if (!netif_attr_test_mask(j, mask, nr_ids) || !netif_attr_test_online(j, online_mask, nr_ids)) active |= remove_xps_queue(dev_maps, tci, index); for (i = num_tc - tc, tci++; --i; tci++) active |= remove_xps_queue(dev_maps, tci, index); } /* free map if not active */ if (!active) reset_xps_maps(dev, dev_maps, is_rxqs_map); out_no_maps: mutex_unlock(&xps_map_mutex); return 0; error: /* remove any maps that we added */ for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), j < nr_ids;) { for (i = num_tc, tci = j * num_tc; i--; tci++) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; if (new_map && new_map != map) kfree(new_map); } } mutex_unlock(&xps_map_mutex); kfree(new_dev_maps); return -ENOMEM; } EXPORT_SYMBOL_GPL(__netif_set_xps_queue); int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index) { int ret; cpus_read_lock(); ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false); cpus_read_unlock(); return ret; } EXPORT_SYMBOL(netif_set_xps_queue); #endif static void netdev_unbind_all_sb_channels(struct net_device *dev) { struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; /* Unbind any subordinate channels */ while (txq-- != &dev->_tx[0]) { if (txq->sb_dev) netdev_unbind_sb_channel(dev, txq->sb_dev); } } void netdev_reset_tc(struct net_device *dev) { #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif netdev_unbind_all_sb_channels(dev); /* Reset TC configuration of device */ dev->num_tc = 0; memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); } EXPORT_SYMBOL(netdev_reset_tc); int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) { if (tc >= dev->num_tc) return -EINVAL; #ifdef CONFIG_XPS netif_reset_xps_queues(dev, offset, count); #endif dev->tc_to_txq[tc].count = count; dev->tc_to_txq[tc].offset = offset; return 0; } EXPORT_SYMBOL(netdev_set_tc_queue); int netdev_set_num_tc(struct net_device *dev, u8 num_tc) { if (num_tc > TC_MAX_QUEUE) return -EINVAL; #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, 0); #endif netdev_unbind_all_sb_channels(dev); dev->num_tc = num_tc; return 0; } EXPORT_SYMBOL(netdev_set_num_tc); void netdev_unbind_sb_channel(struct net_device *dev, struct net_device *sb_dev) { struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues]; #ifdef CONFIG_XPS netif_reset_xps_queues_gt(sb_dev, 0); #endif memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq)); memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map)); while (txq-- != &dev->_tx[0]) { if (txq->sb_dev == sb_dev) txq->sb_dev = NULL; } } EXPORT_SYMBOL(netdev_unbind_sb_channel); int netdev_bind_sb_channel_queue(struct net_device *dev, struct net_device *sb_dev, u8 tc, u16 count, u16 offset) { /* Make certain the sb_dev and dev are already configured */ if (sb_dev->num_tc >= 0 || tc >= dev->num_tc) return -EINVAL; /* We cannot hand out queues we don't have */ if ((offset + count) > dev->real_num_tx_queues) return -EINVAL; /* Record the mapping */ sb_dev->tc_to_txq[tc].count = count; sb_dev->tc_to_txq[tc].offset = offset; /* Provide a way for Tx queue to find the tc_to_txq map or * XPS map for itself. */ while (count--) netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev; return 0; } EXPORT_SYMBOL(netdev_bind_sb_channel_queue); int netdev_set_sb_channel(struct net_device *dev, u16 channel) { /* Do not use a multiqueue device to represent a subordinate channel */ if (netif_is_multiqueue(dev)) return -ENODEV; /* We allow channels 1 - 32767 to be used for subordinate channels. * Channel 0 is meant to be "native" mode and used only to represent * the main root device. We allow writing 0 to reset the device back * to normal mode after being used as a subordinate channel. */ if (channel > S16_MAX) return -EINVAL; dev->num_tc = -channel; return 0; } EXPORT_SYMBOL(netdev_set_sb_channel); /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. */ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { bool disabling; int rc; disabling = txq < dev->real_num_tx_queues; if (txq < 1 || txq > dev->num_tx_queues) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) { ASSERT_RTNL(); rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, txq); if (rc) return rc; if (dev->num_tc) netif_setup_tc(dev, txq); dev_qdisc_change_real_num_tx(dev, txq); dev->real_num_tx_queues = txq; if (disabling) { synchronize_net(); qdisc_reset_all_tx_gt(dev, txq); #ifdef CONFIG_XPS netif_reset_xps_queues_gt(dev, txq); #endif } } else { dev->real_num_tx_queues = txq; } return 0; } EXPORT_SYMBOL(netif_set_real_num_tx_queues); #ifdef CONFIG_SYSFS /** * netif_set_real_num_rx_queues - set actual number of RX queues used * @dev: Network device * @rxq: Actual number of RX queues * * This must be called either with the rtnl_lock held or before * registration of the net device. Returns 0 on success, or a * negative error code. If called before registration, it always * succeeds. */ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) { int rc; if (rxq < 1 || rxq > dev->num_rx_queues) return -EINVAL; if (dev->reg_state == NETREG_REGISTERED) { ASSERT_RTNL(); rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, rxq); if (rc) return rc; } dev->real_num_rx_queues = rxq; return 0; } EXPORT_SYMBOL(netif_set_real_num_rx_queues); #endif /** * netif_get_num_default_rss_queues - default number of RSS queues * * This routine should set an upper limit on the number of RSS queues * used by default by multiqueue devices. */ int netif_get_num_default_rss_queues(void) { return is_kdump_kernel() ? 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); } EXPORT_SYMBOL(netif_get_num_default_rss_queues); static void __netif_reschedule(struct Qdisc *q) { struct softnet_data *sd; unsigned long flags; local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); q->next_sched = NULL; *sd->output_queue_tailp = q; sd->output_queue_tailp = &q->next_sched; raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } void __netif_schedule(struct Qdisc *q) { if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) __netif_reschedule(q); } EXPORT_SYMBOL(__netif_schedule); struct dev_kfree_skb_cb { enum skb_free_reason reason; }; static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) { return (struct dev_kfree_skb_cb *)skb->cb; } void netif_schedule_queue(struct netdev_queue *txq) { rcu_read_lock(); if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { struct Qdisc *q = rcu_dereference(txq->qdisc); __netif_schedule(q); } rcu_read_unlock(); } EXPORT_SYMBOL(netif_schedule_queue); void netif_tx_wake_queue(struct netdev_queue *dev_queue) { if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { struct Qdisc *q; rcu_read_lock(); q = rcu_dereference(dev_queue->qdisc); __netif_schedule(q); rcu_read_unlock(); } } EXPORT_SYMBOL(netif_tx_wake_queue); void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) { unsigned long flags; if (unlikely(!skb)) return; if (likely(refcount_read(&skb->users) == 1)) { smp_rmb(); refcount_set(&skb->users, 0); } else if (likely(!refcount_dec_and_test(&skb->users))) { return; } get_kfree_skb_cb(skb)->reason = reason; local_irq_save(flags); skb->next = __this_cpu_read(softnet_data.completion_queue); __this_cpu_write(softnet_data.completion_queue, skb); raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } EXPORT_SYMBOL(__dev_kfree_skb_irq); void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) { if (in_irq() || irqs_disabled()) __dev_kfree_skb_irq(skb, reason); else dev_kfree_skb(skb); } EXPORT_SYMBOL(__dev_kfree_skb_any); /** * netif_device_detach - mark device as removed * @dev: network device * * Mark device as removed from system and therefore no longer available. */ void netif_device_detach(struct net_device *dev) { if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) { netif_tx_stop_all_queues(dev); } } EXPORT_SYMBOL(netif_device_detach); /** * netif_device_attach - mark device as attached * @dev: network device * * Mark device as attached from system and restart if needed. */ void netif_device_attach(struct net_device *dev) { if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) { netif_tx_wake_all_queues(dev); __netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_device_attach); /* * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. */ static u16 skb_tx_hash(const struct net_device *dev, const struct net_device *sb_dev, struct sk_buff *skb) { u32 hash; u16 qoffset = 0; u16 qcount = dev->real_num_tx_queues; if (dev->num_tc) { u8 tc = netdev_get_prio_tc_map(dev, skb->priority); qoffset = sb_dev->tc_to_txq[tc].offset; qcount = sb_dev->tc_to_txq[tc].count; if (unlikely(!qcount)) { net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n", sb_dev->name, qoffset, tc); qoffset = 0; qcount = dev->real_num_tx_queues; } } if (skb_rx_queue_recorded(skb)) { hash = skb_get_rx_queue(skb); if (hash >= qoffset) hash -= qoffset; while (unlikely(hash >= qcount)) hash -= qcount; return hash + qoffset; } return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; } static void skb_warn_bad_offload(const struct sk_buff *skb) { static const netdev_features_t null_features; struct net_device *dev = skb->dev; const char *name = ""; if (!net_ratelimit()) return; if (dev) { if (dev->dev.parent) name = dev_driver_string(dev->dev.parent); else name = netdev_name(dev); } WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " "gso_type=%d ip_summed=%d\n", name, dev ? &dev->features : &null_features, skb->sk ? &skb->sk->sk_route_caps : &null_features, skb->len, skb->data_len, skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type, skb->ip_summed); } /* * Invalidate hardware checksum when packet is to be mangled, and * complete checksum manually on outgoing path. */ int skb_checksum_help(struct sk_buff *skb) { __wsum csum; int ret = 0, offset; if (skb->ip_summed == CHECKSUM_COMPLETE) goto out_set_summed; if (unlikely(skb_shinfo(skb)->gso_size)) { skb_warn_bad_offload(skb); return -EINVAL; } /* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong. */ if (skb_has_shared_frag(skb)) { ret = __skb_linearize(skb); if (ret) goto out; } offset = skb_checksum_start_offset(skb); BUG_ON(offset >= skb_headlen(skb)); csum = skb_checksum(skb, offset, skb->len - offset, 0); offset += skb->csum_offset; BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); if (skb_cloned(skb) && !skb_clone_writable(skb, offset + sizeof(__sum16))) { ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); if (ret) goto out; } *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; out_set_summed: skb->ip_summed = CHECKSUM_NONE; out: return ret; } EXPORT_SYMBOL(skb_checksum_help); int skb_crc32c_csum_help(struct sk_buff *skb) { __le32 crc32c_csum; int ret = 0, offset, start; if (skb->ip_summed != CHECKSUM_PARTIAL) goto out; if (unlikely(skb_is_gso(skb))) goto out; /* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong. */ if (unlikely(skb_has_shared_frag(skb))) { ret = __skb_linearize(skb); if (ret) goto out; } start = skb_checksum_start_offset(skb); offset = start + offsetof(struct sctphdr, checksum); if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { ret = -EINVAL; goto out; } if (skb_cloned(skb) && !skb_clone_writable(skb, offset + sizeof(__le32))) { ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); if (ret) goto out; } crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, skb->len - start, ~(__u32)0, crc32c_csum_stub)); *(__le32 *)(skb->data + offset) = crc32c_csum; skb->ip_summed = CHECKSUM_NONE; skb->csum_not_inet = 0; out: return ret; } __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { __be16 type = skb->protocol; /* Tunnel gso handlers can set protocol to ethernet. */ if (type == htons(ETH_P_TEB)) { struct ethhdr *eth; if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) return 0; eth = (struct ethhdr *)skb->data; type = eth->h_proto; } return __vlan_get_protocol(skb, type, depth); } /** * skb_mac_gso_segment - mac layer segmentation handler. * @skb: buffer to segment * @features: features for the output path (see dev->features) */ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_offload *ptype; int vlan_depth = skb->mac_len; __be16 type = skb_network_protocol(skb, &vlan_depth); if (unlikely(!type)) return ERR_PTR(-EINVAL); __skb_pull(skb, vlan_depth); rcu_read_lock(); list_for_each_entry_rcu(ptype, &offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { segs = ptype->callbacks.gso_segment(skb, features); break; } } rcu_read_unlock(); __skb_push(skb, skb->data - skb_mac_header(skb)); return segs; } EXPORT_SYMBOL(skb_mac_gso_segment); /* openvswitch calls this on rx path, so we need a different check. */ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) { if (tx_path) return skb->ip_summed != CHECKSUM_PARTIAL && skb->ip_summed != CHECKSUM_UNNECESSARY; return skb->ip_summed == CHECKSUM_NONE; } /** * __skb_gso_segment - Perform segmentation on skb. * @skb: buffer to segment * @features: features for the output path (see dev->features) * @tx_path: whether it is called in TX path * * This function segments the given skb and returns a list of segments. * * It may return NULL if the skb requires no segmentation. This is * only possible when GSO is used for verifying header integrity. * * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb. */ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path) { struct sk_buff *segs; if (unlikely(skb_needs_check(skb, tx_path))) { int err; /* We're going to init ->check field in TCP or UDP header */ err = skb_cow_head(skb, 0); if (err < 0) return ERR_PTR(err); } /* Only report GSO partial support if it will enable us to * support segmentation on this frame without needing additional * work. */ if (features & NETIF_F_GSO_PARTIAL) { netdev_features_t partial_features = NETIF_F_GSO_ROBUST; struct net_device *dev = skb->dev; partial_features |= dev->features & dev->gso_partial_features; if (!skb_gso_ok(skb, features | partial_features)) features &= ~NETIF_F_GSO_PARTIAL; } BUILD_BUG_ON(SKB_SGO_CB_OFFSET + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); SKB_GSO_CB(skb)->encap_level = 0; skb_reset_mac_header(skb); skb_reset_mac_len(skb); segs = skb_mac_gso_segment(skb, features); if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) skb_warn_bad_offload(skb); return segs; } EXPORT_SYMBOL(__skb_gso_segment); /* Take action when hardware reception checksum errors are detected. */ #ifdef CONFIG_BUG void netdev_rx_csum_fault(struct net_device *dev) { if (net_ratelimit()) { pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); dump_stack(); } } EXPORT_SYMBOL(netdev_rx_csum_fault); #endif /* XXX: check that highmem exists at all on the given machine. */ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; if (!(dev->features & NETIF_F_HIGHDMA)) { for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; if (PageHighMem(skb_frag_page(frag))) return 1; } } #endif return 0; } /* If MPLS offload request, verify we are testing hardware MPLS features * instead of standard features for the netdev. */ #if IS_ENABLED(CONFIG_NET_MPLS_GSO) static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { if (eth_p_mpls(type)) features &= skb->dev->mpls_features; return features; } #else static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { return features; } #endif static netdev_features_t harmonize_features(struct sk_buff *skb, netdev_features_t features) { int tmp; __be16 type; type = skb_network_protocol(skb, &tmp); features = net_mpls_features(skb, features, type); if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, type)) { features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); } if (illegal_highdma(skb->dev, skb)) features &= ~NETIF_F_SG; return features; } netdev_features_t passthru_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { return features; } EXPORT_SYMBOL(passthru_features_check); static netdev_features_t dflt_features_check(struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { return vlan_features_check(skb, features); } static netdev_features_t gso_features_check(const struct sk_buff *skb, struct net_device *dev, netdev_features_t features) { u16 gso_segs = skb_shinfo(skb)->gso_segs; if (gso_segs > dev->gso_max_segs) return features & ~NETIF_F_GSO_MASK; /* Support for GSO partial features requires software * intervention before we can actually process the packets * so we need to strip support for any partial features now * and we can pull them back in after we have partially * segmented the frame. */ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) features &= ~dev->gso_partial_features; /* Make sure to clear the IPv4 ID mangling feature if the * IPv4 header has the potential to be fragmented. */ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { struct iphdr *iph = skb->encapsulation ? inner_ip_hdr(skb) : ip_hdr(skb); if (!(iph->frag_off & htons(IP_DF))) features &= ~NETIF_F_TSO_MANGLEID; } return features; } netdev_features_t netif_skb_features(struct sk_buff *skb) { struct net_device *dev = skb->dev; netdev_features_t features = dev->features; if (skb_is_gso(skb)) features = gso_features_check(skb, dev, features); /* If encapsulation offload request, verify we are testing * hardware encapsulation features instead of standard * features for the netdev */ if (skb->encapsulation) features &= dev->hw_enc_features; if (skb_vlan_tagged(skb)) features = netdev_intersect_features(features, dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); if (dev->netdev_ops->ndo_features_check) features &= dev->netdev_ops->ndo_features_check(skb, dev, features); else features &= dflt_features_check(skb, dev, features); return harmonize_features(skb, features); } EXPORT_SYMBOL(netif_skb_features); static int xmit_one(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, bool more) { unsigned int len; int rc; if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) dev_queue_xmit_nit(skb, dev); len = skb->len; trace_net_dev_start_xmit(skb, dev); rc = netdev_start_xmit(skb, dev, txq, more); trace_net_dev_xmit(skb, rc, dev, len); return rc; } struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, struct netdev_queue *txq, int *ret) { struct sk_buff *skb = first; int rc = NETDEV_TX_OK; while (skb) { struct sk_buff *next = skb->next; skb->next = NULL; rc = xmit_one(skb, dev, txq, next != NULL); if (unlikely(!dev_xmit_complete(rc))) { skb->next = next; goto out; } skb = next; if (netif_tx_queue_stopped(txq) && skb) { rc = NETDEV_TX_BUSY; break; } } out: *ret = rc; return skb; } static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) { if (skb_vlan_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) skb = __vlan_hwaccel_push_inside(skb); return skb; } int skb_csum_hwoffload_help(struct sk_buff *skb, const netdev_features_t features) { if (unlikely(skb->csum_not_inet)) return !!(features & NETIF_F_SCTP_CRC) ? 0 : skb_crc32c_csum_help(skb); return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb); } EXPORT_SYMBOL(skb_csum_hwoffload_help); static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) { netdev_features_t features; features = netif_skb_features(skb); skb = validate_xmit_vlan(skb, features); if (unlikely(!skb)) goto out_null; skb = sk_validate_xmit_skb(skb, dev); if (unlikely(!skb)) goto out_null; if (netif_needs_gso(skb, features)) { struct sk_buff *segs; segs = skb_gso_segment(skb, features); if (IS_ERR(segs)) { goto out_kfree_skb; } else if (segs) { consume_skb(skb); skb = segs; } } else { if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) goto out_kfree_skb; /* If packet is not checksummed and device does not * support checksumming for this protocol, complete * checksumming here. */ if (skb->ip_summed == CHECKSUM_PARTIAL) { if (skb->encapsulation) skb_set_inner_transport_header(skb, skb_checksum_start_offset(skb)); else skb_set_transport_header(skb, skb_checksum_start_offset(skb)); if (skb_csum_hwoffload_help(skb, features)) goto out_kfree_skb; } } skb = validate_xmit_xfrm(skb, features, again); return skb; out_kfree_skb: kfree_skb(skb); out_null: atomic_long_inc(&dev->tx_dropped); return NULL; } struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again) { struct sk_buff *next, *head = NULL, *tail; for (; skb != NULL; skb = next) { next = skb->next; skb->next = NULL; /* in case skb wont be segmented, point to itself */ skb->prev = skb; skb = validate_xmit_skb(skb, dev, again); if (!skb) continue; if (!head) head = skb; else tail->next = skb; /* If skb was segmented, skb->prev points to * the last segment. If not, it still contains skb. */ tail = skb->prev; } return head; } EXPORT_SYMBOL_GPL(validate_xmit_skb_list); static void qdisc_pkt_len_init(struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); qdisc_skb_cb(skb)->pkt_len = skb->len; /* To get more precise estimation of bytes sent on wire, * we add to pkt_len the headers size of all segments */ if (shinfo->gso_size) { unsigned int hdr_len; u16 gso_segs = shinfo->gso_segs; /* mac layer + network layer */ hdr_len = skb_transport_header(skb) - skb_mac_header(skb); /* + transport layer */ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { const struct tcphdr *th; struct tcphdr _tcphdr; th = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_tcphdr), &_tcphdr); if (likely(th)) hdr_len += __tcp_hdrlen(th); } else { struct udphdr _udphdr; if (skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_udphdr), &_udphdr)) hdr_len += sizeof(struct udphdr); } if (shinfo->gso_type & SKB_GSO_DODGY) gso_segs = DIV_ROUND_UP(skb->len - hdr_len, shinfo->gso_size); qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } } static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) { spinlock_t *root_lock = qdisc_lock(q); struct sk_buff *to_free = NULL; bool contended; int rc; qdisc_calculate_pkt_len(skb, q); if (q->flags & TCQ_F_NOLOCK) { if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { __qdisc_drop(skb, &to_free); rc = NET_XMIT_DROP; } else { rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; qdisc_run(q); } if (unlikely(to_free)) kfree_skb_list(to_free); return rc; } /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. * This permits qdisc->running owner to get the lock more * often and dequeue packets faster. */ contended = qdisc_is_running(q); if (unlikely(contended)) spin_lock(&q->busylock); spin_lock(root_lock); if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { __qdisc_drop(skb, &to_free); rc = NET_XMIT_DROP; } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && qdisc_run_begin(q)) { /* * This is a work-conserving queue; there are no old skbs * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ qdisc_bstats_update(q, skb); if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; } __qdisc_run(q); } qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); contended = false; } __qdisc_run(q); qdisc_run_end(q); } } spin_unlock(root_lock); if (unlikely(to_free)) kfree_skb_list(to_free); if (unlikely(contended)) spin_unlock(&q->busylock); return rc; } #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) static void skb_update_prio(struct sk_buff *skb) { const struct netprio_map *map; const struct sock *sk; unsigned int prioidx; if (skb->priority) return; map = rcu_dereference_bh(skb->dev->priomap); if (!map) return; sk = skb_to_full_sk(skb); if (!sk) return; prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data); if (prioidx < map->priomap_len) skb->priority = map->priomap[prioidx]; } #else #define skb_update_prio(skb) #endif /** * dev_loopback_xmit - loop back @skb * @net: network namespace this loopback is happening in * @sk: sk needed to be a netfilter okfn * @skb: buffer to transmit */ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_reset_mac_header(skb); __skb_pull(skb, skb_network_offset(skb)); skb->pkt_type = PACKET_LOOPBACK; skb->ip_summed = CHECKSUM_UNNECESSARY; WARN_ON(!skb_dst(skb)); skb_dst_force(skb); netif_rx_ni(skb); return 0; } EXPORT_SYMBOL(dev_loopback_xmit); #ifdef CONFIG_NET_EGRESS static struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); struct tcf_result cl_res; if (!miniq) return skb; /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ mini_qdisc_bstats_cpu_update(miniq, skb); switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); break; case TC_ACT_SHOT: mini_qdisc_qstats_cpu_drop(miniq); *ret = NET_XMIT_DROP; kfree_skb(skb); return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: *ret = NET_XMIT_SUCCESS; consume_skb(skb); return NULL; case TC_ACT_REDIRECT: /* No need to push/pop skb's mac_header here on egress! */ skb_do_redirect(skb); *ret = NET_XMIT_SUCCESS; return NULL; default: break; } return skb; } #endif /* CONFIG_NET_EGRESS */ #ifdef CONFIG_XPS static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, struct xps_dev_maps *dev_maps, unsigned int tci) { struct xps_map *map; int queue_index = -1; if (dev->num_tc) { tci *= dev->num_tc; tci += netdev_get_prio_tc_map(dev, skb->priority); } map = rcu_dereference(dev_maps->attr_map[tci]); if (map) { if (map->len == 1) queue_index = map->queues[0]; else queue_index = map->queues[reciprocal_scale( skb_get_hash(skb), map->len)]; if (unlikely(queue_index >= dev->real_num_tx_queues)) queue_index = -1; } return queue_index; } #endif static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, struct sk_buff *skb) { #ifdef CONFIG_XPS struct xps_dev_maps *dev_maps; struct sock *sk = skb->sk; int queue_index = -1; if (!static_key_false(&xps_needed)) return -1; rcu_read_lock(); if (!static_key_false(&xps_rxqs_needed)) goto get_cpus_map; dev_maps = rcu_dereference(sb_dev->xps_rxqs_map); if (dev_maps) { int tci = sk_rx_queue_get(sk); if (tci >= 0 && tci < dev->num_rx_queues) queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci); } get_cpus_map: if (queue_index < 0) { dev_maps = rcu_dereference(sb_dev->xps_cpus_map); if (dev_maps) { unsigned int tci = skb->sender_cpu - 1; queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci); } } rcu_read_unlock(); return queue_index; #else return -1; #endif } u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev, select_queue_fallback_t fallback) { return 0; } EXPORT_SYMBOL(dev_pick_tx_zero); u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev, select_queue_fallback_t fallback) { return (u16)raw_smp_processor_id() % dev->real_num_tx_queues; } EXPORT_SYMBOL(dev_pick_tx_cpu_id); static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { struct sock *sk = skb->sk; int queue_index = sk_tx_queue_get(sk); sb_dev = sb_dev ? : dev; if (queue_index < 0 || skb->ooo_okay || queue_index >= dev->real_num_tx_queues) { int new_index = get_xps_queue(dev, sb_dev, skb); if (new_index < 0) new_index = skb_tx_hash(dev, sb_dev, skb); if (queue_index != new_index && sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache)) sk_tx_queue_set(sk, new_index); queue_index = new_index; } return queue_index; } struct netdev_queue *netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { int queue_index = 0; #ifdef CONFIG_XPS u32 sender_cpu = skb->sender_cpu - 1; if (sender_cpu >= (u32)NR_CPUS) skb->sender_cpu = raw_smp_processor_id() + 1; #endif if (dev->real_num_tx_queues != 1) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) queue_index = ops->ndo_select_queue(dev, skb, sb_dev, __netdev_pick_tx); else queue_index = __netdev_pick_tx(dev, skb, sb_dev); queue_index = netdev_cap_txqueue(dev, queue_index); } skb_set_queue_mapping(skb, queue_index); return netdev_get_tx_queue(dev, queue_index); } /** * __dev_queue_xmit - transmit a buffer * @skb: buffer to transmit * @sb_dev: suboordinate device used for L2 forwarding offload * * Queue a buffer for transmission to a network device. The caller must * have set the device and priority and built the buffer before calling * this function. The function can be called from an interrupt. * * A negative errno code is returned on a failure. A success does not * guarantee the frame will be transmitted as it may be dropped due * to congestion or traffic shaping. * * ----------------------------------------------------------------------------------- * I notice this method can also return errors from the queue disciplines, * including NET_XMIT_DROP, which is a positive value. So, errors can also * be positive. * * Regardless of the return value, the skb is consumed, so it is currently * difficult to retry a send to this method. (You can bump the ref count * before sending to hold a reference for retry if you are careful.) * * When calling this method, interrupts MUST be enabled. This is because * the BH enable code must have IRQs enabled so that it will not deadlock. * --BLG */ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { struct net_device *dev = skb->dev; struct netdev_queue *txq; struct Qdisc *q; int rc = -ENOMEM; bool again = false; skb_reset_mac_header(skb); if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); /* Disable soft irqs for various locks below. Also * stops preemption for RCU. */ rcu_read_lock_bh(); skb_update_prio(skb); qdisc_pkt_len_init(skb); #ifdef CONFIG_NET_CLS_ACT skb->tc_at_ingress = 0; # ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { skb = sch_handle_egress(skb, &rc, dev); if (!skb) goto out; } # endif #endif /* If device/qdisc don't need skb->dst, release it right now while * its hot in this cpu cache. */ if (dev->priv_flags & IFF_XMIT_DST_RELEASE) skb_dst_drop(skb); else skb_dst_force(skb); txq = netdev_pick_tx(dev, skb, sb_dev); q = rcu_dereference_bh(txq->qdisc); trace_net_dev_queue(skb); if (q->enqueue) { rc = __dev_xmit_skb(skb, q, dev, txq); goto out; } /* The device has no queue. Common case for software devices: * loopback, all the sorts of tunnels... * Really, it is unlikely that netif_tx_lock protection is necessary * here. (f.e. loopback and IP tunnels are clean ignoring statistics * counters.) * However, it is possible, that they rely on protection * made by us here. * Check this and shot the lock. It is not prone from deadlocks. *Either shot noqueue qdisc, it is even simpler 8) */ if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ /* Other cpus might concurrently change txq->xmit_lock_owner * to -1 or to their cpu id, but not to our id. */ if (READ_ONCE(txq->xmit_lock_owner) != cpu) { if (dev_xmit_recursion()) goto recursion_alert; skb = validate_xmit_skb(skb, dev, &again); if (!skb) goto out; HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { dev_xmit_recursion_inc(); skb = dev_hard_start_xmit(skb, dev, txq, &rc); dev_xmit_recursion_dec(); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); goto out; } } HARD_TX_UNLOCK(dev, txq); net_crit_ratelimited("Virtual device %s asks to queue packet!\n", dev->name); } else { /* Recursion is detected! It is possible, * unfortunately */ recursion_alert: net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", dev->name); } } rc = -ENETDOWN; rcu_read_unlock_bh(); atomic_long_inc(&dev->tx_dropped); kfree_skb_list(skb); return rc; out: rcu_read_unlock_bh(); return rc; } int dev_queue_xmit(struct sk_buff *skb) { return __dev_queue_xmit(skb, NULL); } EXPORT_SYMBOL(dev_queue_xmit); int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev) { return __dev_queue_xmit(skb, sb_dev); } EXPORT_SYMBOL(dev_queue_xmit_accel); int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { struct net_device *dev = skb->dev; struct sk_buff *orig_skb = skb; struct netdev_queue *txq; int ret = NETDEV_TX_BUSY; bool again = false; if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev))) goto drop; skb = validate_xmit_skb_list(skb, dev, &again); if (skb != orig_skb) goto drop; skb_set_queue_mapping(skb, queue_id); txq = skb_get_tx_queue(dev, skb); local_bh_disable(); dev_xmit_recursion_inc(); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_drv_stopped(txq)) ret = netdev_start_xmit(skb, dev, txq, false); HARD_TX_UNLOCK(dev, txq); dev_xmit_recursion_dec(); local_bh_enable(); if (!dev_xmit_complete(ret)) kfree_skb(skb); return ret; drop: atomic_long_inc(&dev->tx_dropped); kfree_skb_list(skb); return NET_XMIT_DROP; } EXPORT_SYMBOL(dev_direct_xmit); /************************************************************************* * Receiver routines *************************************************************************/ int netdev_max_backlog __read_mostly = 1000; EXPORT_SYMBOL(netdev_max_backlog); int netdev_tstamp_prequeue __read_mostly = 1; int netdev_budget __read_mostly = 300; /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ int dev_rx_weight __read_mostly = 64; int dev_tx_weight __read_mostly = 64; /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, struct napi_struct *napi) { list_add_tail(&napi->poll_list, &sd->poll_list); __raise_softirq_irqoff(NET_RX_SOFTIRQ); } #ifdef CONFIG_RPS /* One global table that all flow-based protocols share. */ struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); u32 rps_cpu_mask __read_mostly; EXPORT_SYMBOL(rps_cpu_mask); struct static_key rps_needed __read_mostly; EXPORT_SYMBOL(rps_needed); struct static_key rfs_needed __read_mostly; EXPORT_SYMBOL(rfs_needed); static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) { if (next_cpu < nr_cpu_ids) { #ifdef CONFIG_RFS_ACCEL struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *old_rflow; u32 flow_id; u16 rxq_index; int rc; /* Should we steer this flow to a different hardware queue? */ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || !(dev->features & NETIF_F_NTUPLE)) goto out; rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); if (rxq_index == skb_get_rx_queue(skb)) goto out; rxqueue = dev->_rx + rxq_index; flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; flow_id = skb_get_hash(skb) & flow_table->mask; rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) goto out; old_rflow = rflow; rflow = &flow_table->flows[flow_id]; rflow->filter = rc; if (old_rflow->filter == rflow->filter) old_rflow->filter = RPS_NO_FILTER; out: #endif rflow->last_qtail = per_cpu(softnet_data, next_cpu).input_queue_head; } rflow->cpu = next_cpu; return rflow; } /* * get_rps_cpu is called from netif_receive_skb and returns the target * CPU from the RPS map of the receiving queue for a given skb. * rcu_read_lock must be held on entry. */ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { const struct rps_sock_flow_table *sock_flow_table; struct netdev_rx_queue *rxqueue = dev->_rx; struct rps_dev_flow_table *flow_table; struct rps_map *map; int cpu = -1; u32 tcpu; u32 hash; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); if (unlikely(index >= dev->real_num_rx_queues)) { WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " "of RX queues is %u\n", dev->name, index, dev->real_num_rx_queues); goto done; } rxqueue += index; } /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ flow_table = rcu_dereference(rxqueue->rps_flow_table); map = rcu_dereference(rxqueue->rps_map); if (!flow_table && !map) goto done; skb_reset_network_header(skb); hash = skb_get_hash(skb); if (!hash) goto done; sock_flow_table = rcu_dereference(rps_sock_flow_table); if (flow_table && sock_flow_table) { struct rps_dev_flow *rflow; u32 next_cpu; u32 ident; /* First check into global flow table if there is a match */ ident = sock_flow_table->ents[hash & sock_flow_table->mask]; if ((ident ^ hash) & ~rps_cpu_mask) goto try_rps; next_cpu = ident & rps_cpu_mask; /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table */ rflow = &flow_table->flows[hash & flow_table->mask]; tcpu = rflow->cpu; /* * If the desired CPU (where last recvmsg was done) is * different from current CPU (one in the rx-queue flow * table entry), switch if one of the following holds: * - Current CPU is unset (>= nr_cpu_ids). * - Current CPU is offline. * - The current CPU's queue tail has advanced beyond the * last packet that was enqueued using this table entry. * This guarantees that all previous packets for the flow * have been dequeued, thus preserving in order delivery. */ if (unlikely(tcpu != next_cpu) && (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || ((int)(per_cpu(softnet_data, tcpu).input_queue_head - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); } if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { *rflowp = rflow; cpu = tcpu; goto done; } } try_rps: if (map) { tcpu = map->cpus[reciprocal_scale(hash, map->len)]; if (cpu_online(tcpu)) { cpu = tcpu; goto done; } } done: return cpu; } #ifdef CONFIG_RFS_ACCEL /** * rps_may_expire_flow - check whether an RFS hardware filter may be removed * @dev: Device on which the filter was set * @rxq_index: RX queue index * @flow_id: Flow ID passed to ndo_rx_flow_steer() * @filter_id: Filter ID returned by ndo_rx_flow_steer() * * Drivers that implement ndo_rx_flow_steer() should periodically call * this function for each installed filter and remove the filters for * which it returns %true. */ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id) { struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; unsigned int cpu; rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id <= flow_table->mask) { rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); if (rflow->filter == filter_id && cpu < nr_cpu_ids && ((int)(per_cpu(softnet_data, cpu).input_queue_head - rflow->last_qtail) < (int)(10 * flow_table->mask))) expire = false; } rcu_read_unlock(); return expire; } EXPORT_SYMBOL(rps_may_expire_flow); #endif /* CONFIG_RFS_ACCEL */ /* Called from hardirq (IPI) context */ static void rps_trigger_softirq(void *data) { struct softnet_data *sd = data; ____napi_schedule(sd, &sd->backlog); sd->received_rps++; } #endif /* CONFIG_RPS */ /* * Check if this softnet_data structure is another cpu one * If yes, queue it to our IPI list and return 1 * If no, return 0 */ static int rps_ipi_queued(struct softnet_data *sd) { #ifdef CONFIG_RPS struct softnet_data *mysd = this_cpu_ptr(&softnet_data); if (sd != mysd) { sd->rps_ipi_next = mysd->rps_ipi_list; mysd->rps_ipi_list = sd; __raise_softirq_irqoff(NET_RX_SOFTIRQ); return 1; } #endif /* CONFIG_RPS */ return 0; } #ifdef CONFIG_NET_FLOW_LIMIT int netdev_flow_limit_table_len __read_mostly = (1 << 12); #endif static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) { #ifdef CONFIG_NET_FLOW_LIMIT struct sd_flow_limit *fl; struct softnet_data *sd; unsigned int old_flow, new_flow; if (qlen < (netdev_max_backlog >> 1)) return false; sd = this_cpu_ptr(&softnet_data); rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); if (fl) { new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); old_flow = fl->history[fl->history_head]; fl->history[fl->history_head] = new_flow; fl->history_head++; fl->history_head &= FLOW_LIMIT_HISTORY - 1; if (likely(fl->buckets[old_flow])) fl->buckets[old_flow]--; if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { fl->count++; rcu_read_unlock(); return true; } } rcu_read_unlock(); #endif return false; } /* * enqueue_to_backlog is called to queue an skb to a per CPU backlog * queue (may be a remote CPU queue). */ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail) { struct softnet_data *sd; unsigned long flags; unsigned int qlen; sd = &per_cpu(softnet_data, cpu); local_irq_save(flags); rps_lock(sd); if (!netif_running(skb->dev)) goto drop; qlen = skb_queue_len(&sd->input_pkt_queue); if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { if (qlen) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); input_queue_tail_incr_save(sd, qtail); rps_unlock(sd); local_irq_restore(flags); return NET_RX_SUCCESS; } /* Schedule NAPI for backlog device * We can use non atomic operation since we own the queue lock */ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { if (!rps_ipi_queued(sd)) ____napi_schedule(sd, &sd->backlog); } goto enqueue; } drop: sd->dropped++; rps_unlock(sd); local_irq_restore(flags); atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; } static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct netdev_rx_queue *rxqueue; rxqueue = dev->_rx; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); if (unlikely(index >= dev->real_num_rx_queues)) { WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " "of RX queues is %u\n", dev->name, index, dev->real_num_rx_queues); return rxqueue; /* Return first rxqueue */ } rxqueue += index; } return rxqueue; } static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct netdev_rx_queue *rxqueue; void *orig_data, *orig_data_end; u32 metalen, act = XDP_DROP; __be16 orig_eth_type; struct ethhdr *eth; bool orig_bcast; int hlen, off; u32 mac_len; /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. */ if (skb_is_tc_redirected(skb)) return XDP_PASS; /* XDP packets must be linear and must have sufficient headroom * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also * native XDP provides, thus we need to do it here as well. */ if (skb_cloned(skb) || skb_is_nonlinear(skb) || skb_headroom(skb) < XDP_PACKET_HEADROOM) { int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); int troom = skb->tail + skb->data_len - skb->end; /* In case we have to go down the path and also linearize, * then lets do the pskb_expand_head() work just once here. */ if (pskb_expand_head(skb, hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) goto do_drop; if (skb_linearize(skb)) goto do_drop; } /* The XDP program wants to see the packet starting at the MAC * header. */ mac_len = skb->data - skb_mac_header(skb); hlen = skb_headlen(skb) + mac_len; xdp->data = skb->data - mac_len; xdp->data_meta = xdp->data; xdp->data_end = xdp->data + hlen; xdp->data_hard_start = skb->data - skb_headroom(skb); orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); orig_eth_type = eth->h_proto; rxqueue = netif_get_rxqueue(skb); xdp->rxq = &rxqueue->xdp_rxq; act = bpf_prog_run_xdp(xdp_prog, xdp); /* check if bpf_xdp_adjust_head was used */ off = xdp->data - orig_data; if (off) { if (off > 0) __skb_pull(skb, off); else if (off < 0) __skb_push(skb, -off); skb->mac_header += off; skb_reset_network_header(skb); } /* check if bpf_xdp_adjust_tail was used. it can only "shrink" * pckt. */ off = orig_data_end - xdp->data_end; if (off != 0) { skb_set_tail_pointer(skb, xdp->data_end - xdp->data); skb->len -= off; } /* check if XDP changed eth hdr such SKB needs update */ eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) || (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); } switch (act) { case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); break; case XDP_PASS: metalen = xdp->data - xdp->data_meta; if (metalen) skb_metadata_set(skb, metalen); break; default: bpf_warn_invalid_xdp_action(act); /* fall through */ case XDP_ABORTED: trace_xdp_exception(skb->dev, xdp_prog, act); /* fall through */ case XDP_DROP: do_drop: kfree_skb(skb); break; } return act; } /* When doing generic XDP we have to bypass the qdisc layer and the * network taps in order to match in-driver-XDP behavior. */ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) { struct net_device *dev = skb->dev; struct netdev_queue *txq; bool free_skb = true; int cpu, rc; txq = netdev_pick_tx(dev, skb, NULL); cpu = smp_processor_id(); HARD_TX_LOCK(dev, txq, cpu); if (!netif_xmit_stopped(txq)) { rc = netdev_start_xmit(skb, dev, txq, 0); if (dev_xmit_complete(rc)) free_skb = false; } HARD_TX_UNLOCK(dev, txq); if (free_skb) { trace_xdp_exception(dev, xdp_prog, XDP_TX); kfree_skb(skb); } } EXPORT_SYMBOL_GPL(generic_xdp_tx); static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) { if (xdp_prog) { struct xdp_buff xdp; u32 act; int err; act = netif_receive_generic_xdp(skb, &xdp, xdp_prog); if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: err = xdp_do_generic_redirect(skb->dev, skb, &xdp, xdp_prog); if (err) goto out_redir; break; case XDP_TX: generic_xdp_tx(skb, xdp_prog); break; } return XDP_DROP; } } return XDP_PASS; out_redir: kfree_skb(skb); return XDP_DROP; } EXPORT_SYMBOL_GPL(do_xdp_generic); static int netif_rx_internal(struct sk_buff *skb) { int ret; net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); trace_netif_rx(skb); #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; preempt_disable(); rcu_read_lock(); cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu < 0) cpu = smp_processor_id(); ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); preempt_enable(); } else #endif { unsigned int qtail; ret = enqueue_to_backlog(skb, get_cpu(), &qtail); put_cpu(); } return ret; } /** * netif_rx - post buffer to the network code * @skb: buffer to post * * This function receives a packet from a device driver and queues it for * the upper (protocol) levels to process. It always succeeds. The buffer * may be dropped during processing for congestion control or by the * protocol layers. * * return values: * NET_RX_SUCCESS (no congestion) * NET_RX_DROP (packet was dropped) * */ int netif_rx(struct sk_buff *skb) { trace_netif_rx_entry(skb); return netif_rx_internal(skb); } EXPORT_SYMBOL(netif_rx); int netif_rx_ni(struct sk_buff *skb) { int err; trace_netif_rx_ni_entry(skb); preempt_disable(); err = netif_rx_internal(skb); if (local_softirq_pending()) do_softirq(); preempt_enable(); return err; } EXPORT_SYMBOL(netif_rx_ni); static __latent_entropy void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); if (sd->completion_queue) { struct sk_buff *clist; local_irq_disable(); clist = sd->completion_queue; sd->completion_queue = NULL; local_irq_enable(); while (clist) { struct sk_buff *skb = clist; clist = clist->next; WARN_ON(refcount_read(&skb->users)); if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) trace_consume_skb(skb); else trace_kfree_skb(skb, net_tx_action); if (skb->fclone != SKB_FCLONE_UNAVAILABLE) __kfree_skb(skb); else __kfree_skb_defer(skb); } __kfree_skb_flush(); } if (sd->output_queue) { struct Qdisc *head; local_irq_disable(); head = sd->output_queue; sd->output_queue = NULL; sd->output_queue_tailp = &sd->output_queue; local_irq_enable(); while (head) { struct Qdisc *q = head; spinlock_t *root_lock = NULL; head = head->next_sched; if (!(q->flags & TCQ_F_NOLOCK)) { root_lock = qdisc_lock(q); spin_lock(root_lock); } /* We need to make sure head->next_sched is read * before clearing __QDISC_STATE_SCHED */ smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); if (root_lock) spin_unlock(root_lock); } } xfrm_dev_backlog(sd); } #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) /* This hook is defined here for ATM LANE */ int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr) __read_mostly; EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif static inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); struct tcf_result cl_res; /* If there's at least one ingress present somewhere (so * we get here via enabled static key), remaining devices * that are not configured with an ingress qdisc will bail * out here. */ if (!miniq) return skb; if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } qdisc_skb_cb(skb)->pkt_len = skb->len; skb->tc_at_ingress = 1; mini_qdisc_bstats_cpu_update(miniq, skb); switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); break; case TC_ACT_SHOT: mini_qdisc_qstats_cpu_drop(miniq); kfree_skb(skb); return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: consume_skb(skb); return NULL; case TC_ACT_REDIRECT: /* skb_mac_header check was done by cls/act_bpf, so * we can safely push the L2 header back before * redirecting to another netdev */ __skb_push(skb, skb->mac_len); skb_do_redirect(skb); return NULL; case TC_ACT_REINSERT: /* this does not scrub the packet, and updates stats on error */ skb_tc_reinsert(skb, &cl_res); return NULL; default: break; } #endif /* CONFIG_NET_CLS_ACT */ return skb; } /** * netdev_is_rx_handler_busy - check if receive handler is registered * @dev: device to check * * Check if a receive handler is already registered for a given device. * Return true if there one. * * The caller must hold the rtnl_mutex. */ bool netdev_is_rx_handler_busy(struct net_device *dev) { ASSERT_RTNL(); return dev && rtnl_dereference(dev->rx_handler); } EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); /** * netdev_rx_handler_register - register receive handler * @dev: device to register a handler for * @rx_handler: receive handler to register * @rx_handler_data: data pointer that is used by rx handler * * Register a receive handler for a device. This handler will then be * called from __netif_receive_skb. A negative errno code is returned * on a failure. * * The caller must hold the rtnl_mutex. * * For a general description of rx_handler, see enum rx_handler_result. */ int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data) { if (netdev_is_rx_handler_busy(dev)) return -EBUSY; if (dev->priv_flags & IFF_NO_RX_HANDLER) return -EINVAL; /* Note: rx_handler_data must be set before rx_handler */ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); rcu_assign_pointer(dev->rx_handler, rx_handler); return 0; } EXPORT_SYMBOL_GPL(netdev_rx_handler_register); /** * netdev_rx_handler_unregister - unregister receive handler * @dev: device to unregister a handler from * * Unregister a receive handler from a device. * * The caller must hold the rtnl_mutex. */ void netdev_rx_handler_unregister(struct net_device *dev) { ASSERT_RTNL(); RCU_INIT_POINTER(dev->rx_handler, NULL); /* a reader seeing a non NULL rx_handler in a rcu_read_lock() * section has a guarantee to see a non NULL rx_handler_data * as well. */ synchronize_net(); RCU_INIT_POINTER(dev->rx_handler_data, NULL); } EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); /* * Limit the use of PFMEMALLOC reserves to those protocols that implement * the special handling of PFMEMALLOC skbs. */ static bool skb_pfmemalloc_protocol(struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_ARP): case htons(ETH_P_IP): case htons(ETH_P_IPV6): case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): return true; default: return false; } } static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { #ifdef CONFIG_NETFILTER_INGRESS if (nf_hook_ingress_active(skb)) { int ingress_retval; if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } rcu_read_lock(); ingress_retval = nf_hook_ingress(skb); rcu_read_unlock(); return ingress_retval; } #endif /* CONFIG_NETFILTER_INGRESS */ return 0; } static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, struct packet_type **ppt_prev) { struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; struct sk_buff *skb = *pskb; struct net_device *orig_dev; bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb); trace_netif_receive_skb(skb); orig_dev = skb->dev; skb_reset_network_header(skb); if (!skb_transport_header_was_set(skb)) skb_reset_transport_header(skb); skb_reset_mac_len(skb); pt_prev = NULL; another_round: skb->skb_iif = skb->dev->ifindex; __this_cpu_inc(softnet_data.processed); if (static_branch_unlikely(&generic_xdp_needed_key)) { int ret2; preempt_disable(); ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); preempt_enable(); if (ret2 != XDP_PASS) { ret = NET_RX_DROP; goto out; } skb_reset_mac_len(skb); } if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || skb->protocol == cpu_to_be16(ETH_P_8021AD)) { skb = skb_vlan_untag(skb); if (unlikely(!skb)) goto out; } if (skb_skip_tc_classify(skb)) goto skip_classify; if (pfmemalloc) goto skip_taps; list_for_each_entry_rcu(ptype, &ptype_all, list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } skip_taps: #ifdef CONFIG_NET_INGRESS if (static_branch_unlikely(&ingress_needed_key)) { skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); if (!skb) goto out; if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) goto out; } #endif skb_reset_tc(skb); skip_classify: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; if (skb_vlan_tag_present(skb)) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } if (vlan_do_receive(&skb)) goto another_round; else if (unlikely(!skb)) goto out; } rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } switch (rx_handler(&skb)) { case RX_HANDLER_CONSUMED: ret = NET_RX_SUCCESS; goto out; case RX_HANDLER_ANOTHER: goto another_round; case RX_HANDLER_EXACT: deliver_exact = true; case RX_HANDLER_PASS: break; default: BUG(); } } if (unlikely(skb_vlan_tag_present(skb))) { if (skb_vlan_tag_get_id(skb)) skb->pkt_type = PACKET_OTHERHOST; /* Note: we might in the future use prio bits * and set skb->priority like in vlan_do_receive() * For the time being, just ignore Priority Code Point */ skb->vlan_tci = 0; } type = skb->protocol; /* deliver only exact match when indicated */ if (likely(!deliver_exact)) { deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &ptype_base[ntohs(type) & PTYPE_HASH_MASK]); } deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &orig_dev->ptype_specific); if (unlikely(skb->dev != orig_dev)) { deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &skb->dev->ptype_specific); } if (pt_prev) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; *ppt_prev = pt_prev; } else { drop: if (!deliver_exact) atomic_long_inc(&skb->dev->rx_dropped); else atomic_long_inc(&skb->dev->rx_nohandler); kfree_skb(skb); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) */ ret = NET_RX_DROP; } out: /* The invariant here is that if *ppt_prev is not NULL * then skb should also be non-NULL. * * Apparently *ppt_prev assignment above holds this invariant due to * skb dereferencing near it. */ *pskb = skb; return ret; } static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc) { struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; int ret; ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); if (pt_prev) ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); return ret; } /** * netif_receive_skb_core - special purpose version of netif_receive_skb * @skb: buffer to process * * More direct receive version of netif_receive_skb(). It should * only be used by callers that have a need to skip RPS and Generic XDP. * Caller must also take care of handling if (page_is_)pfmemalloc. * * This function may only be called from softirq context and interrupts * should be enabled. * * Return values (usually ignored): * NET_RX_SUCCESS: no congestion * NET_RX_DROP: packet was dropped */ int netif_receive_skb_core(struct sk_buff *skb) { int ret; rcu_read_lock(); ret = __netif_receive_skb_one_core(skb, false); rcu_read_unlock(); return ret; } EXPORT_SYMBOL(netif_receive_skb_core); static inline void __netif_receive_skb_list_ptype(struct list_head *head, struct packet_type *pt_prev, struct net_device *orig_dev) { struct sk_buff *skb, *next; if (!pt_prev) return; if (list_empty(head)) return; if (pt_prev->list_func != NULL) pt_prev->list_func(head, pt_prev, orig_dev); else list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } } static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc) { /* Fast-path assumptions: * - There is no RX handler. * - Only one packet_type matches. * If either of these fails, we will end up doing some per-packet * processing in-line, then handling the 'last ptype' for the whole * sublist. This can't cause out-of-order delivery to any single ptype, * because the 'last ptype' must be constant across the sublist, and all * other ptypes are handled per-packet. */ /* Current (common) ptype of sublist */ struct packet_type *pt_curr = NULL; /* Current (common) orig_dev of sublist */ struct net_device *od_curr = NULL; struct list_head sublist; struct sk_buff *skb, *next; INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; skb_list_del_init(skb); __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev); if (!pt_prev) continue; if (pt_curr != pt_prev || od_curr != orig_dev) { /* dispatch old sublist */ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); /* start new sublist */ INIT_LIST_HEAD(&sublist); pt_curr = pt_prev; od_curr = orig_dev; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr); } static int __netif_receive_skb(struct sk_buff *skb) { int ret; if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { unsigned int noreclaim_flag; /* * PFMEMALLOC skbs are special, they should * - be delivered to SOCK_MEMALLOC sockets only * - stay away from userspace * - have bounded memory usage * * Use PF_MEMALLOC as this saves us from propagating the allocation * context down to all allocation sites. */ noreclaim_flag = memalloc_noreclaim_save(); ret = __netif_receive_skb_one_core(skb, true); memalloc_noreclaim_restore(noreclaim_flag); } else ret = __netif_receive_skb_one_core(skb, false); return ret; } static void __netif_receive_skb_list(struct list_head *head) { unsigned long noreclaim_flag = 0; struct sk_buff *skb, *next; bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */ list_for_each_entry_safe(skb, next, head, list) { if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) { struct list_head sublist; /* Handle the previous sublist */ list_cut_before(&sublist, head, &skb->list); if (!list_empty(&sublist)) __netif_receive_skb_list_core(&sublist, pfmemalloc); pfmemalloc = !pfmemalloc; /* See comments in __netif_receive_skb */ if (pfmemalloc) noreclaim_flag = memalloc_noreclaim_save(); else memalloc_noreclaim_restore(noreclaim_flag); } } /* Handle the remaining sublist */ if (!list_empty(head)) __netif_receive_skb_list_core(head, pfmemalloc); /* Restore pflags */ if (pfmemalloc) memalloc_noreclaim_restore(noreclaim_flag); } static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) { struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); struct bpf_prog *new = xdp->prog; int ret = 0; switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); if (old) bpf_prog_put(old); if (old && !new) { static_branch_dec(&generic_xdp_needed_key); } else if (new && !old) { static_branch_inc(&generic_xdp_needed_key); dev_disable_lro(dev); dev_disable_gro_hw(dev); } break; case XDP_QUERY_PROG: xdp->prog_id = old ? old->aux->id : 0; break; default: ret = -EINVAL; break; } return ret; } static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; rcu_read_lock(); #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu >= 0) { ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); rcu_read_unlock(); return ret; } } #endif ret = __netif_receive_skb(skb); rcu_read_unlock(); return ret; } static void netif_receive_skb_list_internal(struct list_head *head) { struct sk_buff *skb, *next; struct list_head sublist; INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); } list_splice_init(&sublist, head); rcu_read_lock(); #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { list_for_each_entry_safe(skb, next, head, list) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu = get_rps_cpu(skb->dev, skb, &rflow); if (cpu >= 0) { /* Will be handled, remove from list */ skb_list_del_init(skb); enqueue_to_backlog(skb, cpu, &rflow->last_qtail); } } } #endif __netif_receive_skb_list(head); rcu_read_unlock(); } /** * netif_receive_skb - process receive buffer from network * @skb: buffer to process * * netif_receive_skb() is the main receive data processing function. * It always succeeds. The buffer may be dropped during processing * for congestion control or by the protocol layers. * * This function may only be called from softirq context and interrupts * should be enabled. * * Return values (usually ignored): * NET_RX_SUCCESS: no congestion * NET_RX_DROP: packet was dropped */ int netif_receive_skb(struct sk_buff *skb) { trace_netif_receive_skb_entry(skb); return netif_receive_skb_internal(skb); } EXPORT_SYMBOL(netif_receive_skb); /** * netif_receive_skb_list - process many receive buffers from network * @head: list of skbs to process. * * Since return value of netif_receive_skb() is normally ignored, and * wouldn't be meaningful for a list, this function returns void. * * This function may only be called from softirq context and interrupts * should be enabled. */ void netif_receive_skb_list(struct list_head *head) { struct sk_buff *skb; if (list_empty(head)) return; list_for_each_entry(skb, head, list) trace_netif_receive_skb_list_entry(skb); netif_receive_skb_list_internal(head); } EXPORT_SYMBOL(netif_receive_skb_list); DEFINE_PER_CPU(struct work_struct, flush_works); /* Network device is going away, flush any packets still pending */ static void flush_backlog(struct work_struct *work) { struct sk_buff *skb, *tmp; struct softnet_data *sd; local_bh_disable(); sd = this_cpu_ptr(&softnet_data); local_irq_disable(); rps_lock(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); dev_kfree_skb_irq(skb); input_queue_head_incr(sd); } } rps_unlock(sd); local_irq_enable(); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); kfree_skb(skb); input_queue_head_incr(sd); } } local_bh_enable(); } static void flush_all_backlogs(void) { unsigned int cpu; get_online_cpus(); for_each_online_cpu(cpu) queue_work_on(cpu, system_highpri_wq, per_cpu_ptr(&flush_works, cpu)); for_each_online_cpu(cpu) flush_work(per_cpu_ptr(&flush_works, cpu)); put_online_cpus(); } static int napi_gro_complete(struct sk_buff *skb) { struct packet_offload *ptype; __be16 type = skb->protocol; struct list_head *head = &offload_base; int err = -ENOENT; BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); if (NAPI_GRO_CB(skb)->count == 1) { skb_shinfo(skb)->gso_size = 0; goto out; } rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { if (ptype->type != type || !ptype->callbacks.gro_complete) continue; err = ptype->callbacks.gro_complete(skb, 0); break; } rcu_read_unlock(); if (err) { WARN_ON(&ptype->list == head); kfree_skb(skb); return NET_RX_SUCCESS; } out: return netif_receive_skb_internal(skb); } static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, bool flush_old) { struct list_head *head = &napi->gro_hash[index].list; struct sk_buff *skb, *p; list_for_each_entry_safe_reverse(skb, p, head, list) { if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; list_del(&skb->list); skb->next = NULL; napi_gro_complete(skb); napi->gro_hash[index].count--; } if (!napi->gro_hash[index].count) __clear_bit(index, &napi->gro_bitmask); } /* napi->gro_hash[].list contains packets ordered by age. * youngest packets at the head of it. * Complete skbs in reverse order to reduce latencies. */ void napi_gro_flush(struct napi_struct *napi, bool flush_old) { u32 i; for (i = 0; i < GRO_HASH_BUCKETS; i++) { if (test_bit(i, &napi->gro_bitmask)) __napi_gro_flush_chain(napi, i, flush_old); } } EXPORT_SYMBOL(napi_gro_flush); static struct list_head *gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) { unsigned int maclen = skb->dev->hard_header_len; u32 hash = skb_get_hash_raw(skb); struct list_head *head; struct sk_buff *p; head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list; list_for_each_entry(p, head, list) { unsigned long diffs; NAPI_GRO_CB(p)->flush = 0; if (hash != skb_get_hash_raw(p)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), skb_mac_header(skb)); else if (!diffs) diffs = memcmp(skb_mac_header(p), skb_mac_header(skb), maclen); NAPI_GRO_CB(p)->same_flow = !diffs; } return head; } static void skb_gro_reset_offset(struct sk_buff *skb) { const struct skb_shared_info *pinfo = skb_shinfo(skb); const skb_frag_t *frag0 = &pinfo->frags[0]; NAPI_GRO_CB(skb)->data_offset = 0; NAPI_GRO_CB(skb)->frag0 = NULL; NAPI_GRO_CB(skb)->frag0_len = 0; if (skb_mac_header(skb) == skb_tail_pointer(skb) && pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0)) && (!NET_IP_ALIGN || !(skb_frag_off(frag0) & 3))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, skb_frag_size(frag0), skb->end - skb->tail); } } static void gro_pull_from_frag0(struct sk_buff *skb, int grow) { struct skb_shared_info *pinfo = skb_shinfo(skb); BUG_ON(skb->end - skb->tail < grow); memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); skb->data_len -= grow; skb->tail += grow; pinfo->frags[0].page_offset += grow; skb_frag_size_sub(&pinfo->frags[0], grow); if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { skb_frag_unref(skb, 0); memmove(pinfo->frags, pinfo->frags + 1, --pinfo->nr_frags * sizeof(pinfo->frags[0])); } } static void gro_flush_oldest(struct list_head *head) { struct sk_buff *oldest; oldest = list_last_entry(head, struct sk_buff, list); /* We are called with head length >= MAX_GRO_SKBS, so this is * impossible. */ if (WARN_ON_ONCE(!oldest)) return; /* Do not adjust napi->gro_hash[].count, caller is adding a new * SKB to the chain. */ list_del(&oldest->list); oldest->next = NULL; napi_gro_complete(oldest); } static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; struct list_head *gro_head; struct sk_buff *pp = NULL; enum gro_result ret; int same_flow; int grow; if (netif_elide_gro(skb->dev)) goto normal; gro_head = gro_list_prepare(napi, skb); rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { if (ptype->type != type || !ptype->callbacks.gro_receive) continue; skb_set_network_header(skb, skb_gro_offset(skb)); skb_reset_mac_len(skb); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->encap_mark = 0; NAPI_GRO_CB(skb)->recursion_counter = 0; NAPI_GRO_CB(skb)->is_fou = 0; NAPI_GRO_CB(skb)->is_atomic = 1; NAPI_GRO_CB(skb)->gro_remcsum_start = 0; /* Setup for GRO checksum validation */ switch (skb->ip_summed) { case CHECKSUM_COMPLETE: NAPI_GRO_CB(skb)->csum = skb->csum; NAPI_GRO_CB(skb)->csum_valid = 1; NAPI_GRO_CB(skb)->csum_cnt = 0; break; case CHECKSUM_UNNECESSARY: NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; NAPI_GRO_CB(skb)->csum_valid = 0; break; default: NAPI_GRO_CB(skb)->csum_cnt = 0; NAPI_GRO_CB(skb)->csum_valid = 0; } pp = ptype->callbacks.gro_receive(gro_head, skb); break; } rcu_read_unlock(); if (&ptype->list == head) goto normal; if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) { ret = GRO_CONSUMED; goto ok; } same_flow = NAPI_GRO_CB(skb)->same_flow; ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; if (pp) { list_del(&pp->list); pp->next = NULL; napi_gro_complete(pp); napi->gro_hash[hash].count--; } if (same_flow) goto ok; if (NAPI_GRO_CB(skb)->flush) goto normal; if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) { gro_flush_oldest(gro_head); } else { napi->gro_hash[hash].count++; } NAPI_GRO_CB(skb)->count = 1; NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; skb_shinfo(skb)->gso_size = skb_gro_len(skb); list_add(&skb->list, gro_head); ret = GRO_HELD; pull: grow = skb_gro_offset(skb) - skb_headlen(skb); if (grow > 0) gro_pull_from_frag0(skb, grow); ok: if (napi->gro_hash[hash].count) { if (!test_bit(hash, &napi->gro_bitmask)) __set_bit(hash, &napi->gro_bitmask); } else if (test_bit(hash, &napi->gro_bitmask)) { __clear_bit(hash, &napi->gro_bitmask); } return ret; normal: ret = GRO_NORMAL; goto pull; } struct packet_offload *gro_find_receive_by_type(__be16 type) { struct list_head *offload_head = &offload_base; struct packet_offload *ptype; list_for_each_entry_rcu(ptype, offload_head, list) { if (ptype->type != type || !ptype->callbacks.gro_receive) continue; return ptype; } return NULL; } EXPORT_SYMBOL(gro_find_receive_by_type); struct packet_offload *gro_find_complete_by_type(__be16 type) { struct list_head *offload_head = &offload_base; struct packet_offload *ptype; list_for_each_entry_rcu(ptype, offload_head, list) { if (ptype->type != type || !ptype->callbacks.gro_complete) continue; return ptype; } return NULL; } EXPORT_SYMBOL(gro_find_complete_by_type); static void napi_skb_free_stolen_head(struct sk_buff *skb) { skb_dst_drop(skb); secpath_reset(skb); kmem_cache_free(skbuff_head_cache, skb); } static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) { switch (ret) { case GRO_NORMAL: if (netif_receive_skb_internal(skb)) ret = GRO_DROP; break; case GRO_DROP: kfree_skb(skb); break; case GRO_MERGED_FREE: if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) napi_skb_free_stolen_head(skb); else __kfree_skb(skb); break; case GRO_HELD: case GRO_MERGED: case GRO_CONSUMED: break; } return ret; } gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb); return napi_skb_finish(dev_gro_receive(napi, skb), skb); } EXPORT_SYMBOL(napi_gro_receive); static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { if (unlikely(skb->pfmemalloc)) { consume_skb(skb); return; } __skb_pull(skb, skb_headlen(skb)); /* restore the reserve we had after netdev_alloc_skb_ip_align() */ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); skb->vlan_tci = 0; skb->dev = napi->dev; skb->skb_iif = 0; /* eth_type_trans() assumes pkt_type is PACKET_HOST */ skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); secpath_reset(skb); napi->skb = skb; } struct sk_buff *napi_get_frags(struct napi_struct *napi) { struct sk_buff *skb = napi->skb; if (!skb) { skb = napi_alloc_skb(napi, GRO_MAX_HEAD); if (skb) { napi->skb = skb; skb_mark_napi_id(skb, napi); } } return skb; } EXPORT_SYMBOL(napi_get_frags); static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, gro_result_t ret) { switch (ret) { case GRO_NORMAL: case GRO_HELD: __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); if (ret == GRO_NORMAL && netif_receive_skb_internal(skb)) ret = GRO_DROP; break; case GRO_DROP: napi_reuse_skb(napi, skb); break; case GRO_MERGED_FREE: if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) napi_skb_free_stolen_head(skb); else napi_reuse_skb(napi, skb); break; case GRO_MERGED: case GRO_CONSUMED: break; } return ret; } /* Upper GRO stack assumes network header starts at gro_offset=0 * Drivers could call both napi_gro_frags() and napi_gro_receive() * We copy ethernet header into skb->data to have a common layout. */ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) { struct sk_buff *skb = napi->skb; const struct ethhdr *eth; unsigned int hlen = sizeof(*eth); napi->skb = NULL; skb_reset_mac_header(skb); skb_gro_reset_offset(skb); if (unlikely(skb_gro_header_hard(skb, hlen))) { eth = skb_gro_header_slow(skb, hlen, 0); if (unlikely(!eth)) { net_warn_ratelimited("%s: dropping impossible skb from %s\n", __func__, napi->dev->name); napi_reuse_skb(napi, skb); return NULL; } } else { eth = (const struct ethhdr *)skb->data; gro_pull_from_frag0(skb, hlen); NAPI_GRO_CB(skb)->frag0 += hlen; NAPI_GRO_CB(skb)->frag0_len -= hlen; } __skb_pull(skb, hlen); /* * This works because the only protocols we care about don't require * special handling. * We'll fix it up properly in napi_frags_finish() */ skb->protocol = eth->h_proto; return skb; } gro_result_t napi_gro_frags(struct napi_struct *napi) { struct sk_buff *skb = napi_frags_skb(napi); if (!skb) return GRO_DROP; trace_napi_gro_frags_entry(skb); return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); } EXPORT_SYMBOL(napi_gro_frags); /* Compute the checksum from gro_offset and return the folded value * after adding in any pseudo checksum. */ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) { __wsum wsum; __sum16 sum; wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev); } NAPI_GRO_CB(skb)->csum = wsum; NAPI_GRO_CB(skb)->csum_valid = 1; return sum; } EXPORT_SYMBOL(__skb_gro_checksum_complete); static void net_rps_send_ipi(struct softnet_data *remsd) { #ifdef CONFIG_RPS while (remsd) { struct softnet_data *next = remsd->rps_ipi_next; if (cpu_online(remsd->cpu)) smp_call_function_single_async(remsd->cpu, &remsd->csd); remsd = next; } #endif } /* * net_rps_action_and_irq_enable sends any pending IPI's for rps. * Note: called with local irq disabled, but exits with local irq enabled. */ static void net_rps_action_and_irq_enable(struct softnet_data *sd) { #ifdef CONFIG_RPS struct softnet_data *remsd = sd->rps_ipi_list; if (remsd) { sd->rps_ipi_list = NULL; local_irq_enable(); /* Send pending IPI's to kick RPS processing on remote cpus. */ net_rps_send_ipi(remsd); } else #endif local_irq_enable(); } static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) { #ifdef CONFIG_RPS return sd->rps_ipi_list != NULL; #else return false; #endif } static int process_backlog(struct napi_struct *napi, int quota) { struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); bool again = true; int work = 0; /* Check if we have pending ipi, its better to send them now, * not waiting net_rx_action() end. */ if (sd_has_rps_ipi_waiting(sd)) { local_irq_disable(); net_rps_action_and_irq_enable(sd); } napi->weight = READ_ONCE(dev_rx_weight); while (again) { struct sk_buff *skb; while ((skb = __skb_dequeue(&sd->process_queue))) { rcu_read_lock(); __netif_receive_skb(skb); rcu_read_unlock(); input_queue_head_incr(sd); if (++work >= quota) return work; } local_irq_disable(); rps_lock(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* * Inline a custom version of __napi_complete(). * only current cpu owns and manipulates this napi, * and NAPI_STATE_SCHED is the only possible flag set * on backlog. * We can use a plain write instead of clear_bit(), * and we dont need an smp_mb() memory barrier. */ napi->state = 0; again = false; } else { skb_queue_splice_tail_init(&sd->input_pkt_queue, &sd->process_queue); } rps_unlock(sd); local_irq_enable(); } return work; } /** * __napi_schedule - schedule for receive * @n: entry to schedule * * The entry's receive function will be scheduled to run. * Consider using __napi_schedule_irqoff() if hard irqs are masked. */ void __napi_schedule(struct napi_struct *n) { unsigned long flags; local_irq_save(flags); ____napi_schedule(this_cpu_ptr(&softnet_data), n); local_irq_restore(flags); } EXPORT_SYMBOL(__napi_schedule); /** * napi_schedule_prep - check if napi can be scheduled * @n: napi context * * Test if NAPI routine is already running, and if not mark * it as running. This is used as a condition variable * insure only one NAPI poll instance runs. We also make * sure there is no pending NAPI disable. */ bool napi_schedule_prep(struct napi_struct *n) { unsigned long val, new; do { val = READ_ONCE(n->state); if (unlikely(val & NAPIF_STATE_DISABLE)) return false; new = val | NAPIF_STATE_SCHED; /* Sets STATE_MISSED bit if STATE_SCHED was already set * This was suggested by Alexander Duyck, as compiler * emits better code than : * if (val & NAPIF_STATE_SCHED) * new |= NAPIF_STATE_MISSED; */ new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * NAPIF_STATE_MISSED; } while (cmpxchg(&n->state, val, new) != val); return !(val & NAPIF_STATE_SCHED); } EXPORT_SYMBOL(napi_schedule_prep); /** * __napi_schedule_irqoff - schedule for receive * @n: entry to schedule * * Variant of __napi_schedule() assuming hard irqs are masked. * * On PREEMPT_RT enabled kernels this maps to __napi_schedule() * because the interrupt disabled assumption might not be true * due to force-threaded interrupts and spinlock substitution. */ void __napi_schedule_irqoff(struct napi_struct *n) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ____napi_schedule(this_cpu_ptr(&softnet_data), n); else __napi_schedule(n); } EXPORT_SYMBOL(__napi_schedule_irqoff); bool napi_complete_done(struct napi_struct *n, int work_done) { unsigned long flags, val, new; /* * 1) Don't let napi dequeue from the cpu poll list * just in case its running on a different cpu. * 2) If we are busy polling, do nothing here, we have * the guarantee we will be called later. */ if (unlikely(n->state & (NAPIF_STATE_NPSVC | NAPIF_STATE_IN_BUSY_POLL))) return false; if (n->gro_bitmask) { unsigned long timeout = 0; if (work_done) timeout = n->dev->gro_flush_timeout; /* When the NAPI instance uses a timeout and keeps postponing * it, we need to bound somehow the time packets are kept in * the GRO layer */ napi_gro_flush(n, !!timeout); if (timeout) hrtimer_start(&n->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); } if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ local_irq_save(flags); list_del_init(&n->poll_list); local_irq_restore(flags); } do { val = READ_ONCE(n->state); WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); /* If STATE_MISSED was set, leave STATE_SCHED set, * because we will call napi->poll() one more time. * This C code was suggested by Alexander Duyck to help gcc. */ new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * NAPIF_STATE_SCHED; } while (cmpxchg(&n->state, val, new) != val); if (unlikely(val & NAPIF_STATE_MISSED)) { __napi_schedule(n); return false; } return true; } EXPORT_SYMBOL(napi_complete_done); /* must be called under rcu_read_lock(), as we dont take a reference */ static struct napi_struct *napi_by_id(unsigned int napi_id) { unsigned int hash = napi_id % HASH_SIZE(napi_hash); struct napi_struct *napi; hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) if (napi->napi_id == napi_id) return napi; return NULL; } #if defined(CONFIG_NET_RX_BUSY_POLL) #define BUSY_POLL_BUDGET 8 static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) { int rc; /* Busy polling means there is a high chance device driver hard irq * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was * set in napi_schedule_prep(). * Since we are about to call napi->poll() once more, we can safely * clear NAPI_STATE_MISSED. * * Note: x86 could use a single "lock and ..." instruction * to perform these two clear_bit() */ clear_bit(NAPI_STATE_MISSED, &napi->state); clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); local_bh_disable(); /* All we really want here is to re-enable device interrupts. * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ rc = napi->poll(napi, BUSY_POLL_BUDGET); trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); netpoll_poll_unlock(have_poll_lock); if (rc == BUSY_POLL_BUDGET) __napi_schedule(napi); local_bh_enable(); } void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg) { unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); void *have_poll_lock = NULL; struct napi_struct *napi; restart: napi_poll = NULL; rcu_read_lock(); napi = napi_by_id(napi_id); if (!napi) goto out; preempt_disable(); for (;;) { int work = 0; local_bh_disable(); if (!napi_poll) { unsigned long val = READ_ONCE(napi->state); /* If multiple threads are competing for this napi, * we avoid dirtying napi->state as much as we can. */ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | NAPIF_STATE_IN_BUSY_POLL)) goto count; if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | NAPIF_STATE_SCHED) != val) goto count; have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } work = napi_poll(napi, BUSY_POLL_BUDGET); trace_napi_poll(napi, work, BUSY_POLL_BUDGET); count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); local_bh_enable(); if (!loop_end || loop_end(loop_end_arg, start_time)) break; if (unlikely(need_resched())) { if (napi_poll) busy_poll_stop(napi, have_poll_lock); preempt_enable(); rcu_read_unlock(); cond_resched(); if (loop_end(loop_end_arg, start_time)) return; goto restart; } cpu_relax(); } if (napi_poll) busy_poll_stop(napi, have_poll_lock); preempt_enable(); out: rcu_read_unlock(); } EXPORT_SYMBOL(napi_busy_loop); #endif /* CONFIG_NET_RX_BUSY_POLL */ static void napi_hash_add(struct napi_struct *napi) { if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) || test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) return; spin_lock(&napi_hash_lock); /* 0..NR_CPUS range is reserved for sender_cpu use */ do { if (unlikely(++napi_gen_id < MIN_NAPI_ID)) napi_gen_id = MIN_NAPI_ID; } while (napi_by_id(napi_gen_id)); napi->napi_id = napi_gen_id; hlist_add_head_rcu(&napi->napi_hash_node, &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); spin_unlock(&napi_hash_lock); } /* Warning : caller is responsible to make sure rcu grace period * is respected before freeing memory containing @napi */ bool napi_hash_del(struct napi_struct *napi) { bool rcu_sync_needed = false; spin_lock(&napi_hash_lock); if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) { rcu_sync_needed = true; hlist_del_rcu(&napi->napi_hash_node); } spin_unlock(&napi_hash_lock); return rcu_sync_needed; } EXPORT_SYMBOL_GPL(napi_hash_del); static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) { struct napi_struct *napi; napi = container_of(timer, struct napi_struct, timer); /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ if (napi->gro_bitmask && !napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) __napi_schedule_irqoff(napi); return HRTIMER_NORESTART; } static void init_gro_hash(struct napi_struct *napi) { int i; for (i = 0; i < GRO_HASH_BUCKETS; i++) { INIT_LIST_HEAD(&napi->gro_hash[i].list); napi->gro_hash[i].count = 0; } napi->gro_bitmask = 0; } void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { INIT_LIST_HEAD(&napi->poll_list); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; init_gro_hash(napi); napi->skb = NULL; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) pr_err_once("netif_napi_add() called with weight %d on device %s\n", weight, dev->name); napi->weight = weight; napi->dev = dev; #ifdef CONFIG_NETPOLL napi->poll_owner = -1; #endif set_bit(NAPI_STATE_SCHED, &napi->state); set_bit(NAPI_STATE_NPSVC, &napi->state); list_add_rcu(&napi->dev_list, &dev->napi_list); napi_hash_add(napi); } EXPORT_SYMBOL(netif_napi_add); void napi_disable(struct napi_struct *n) { might_sleep(); set_bit(NAPI_STATE_DISABLE, &n->state); while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) msleep(1); while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state)) msleep(1); hrtimer_cancel(&n->timer); clear_bit(NAPI_STATE_DISABLE, &n->state); } EXPORT_SYMBOL(napi_disable); static void flush_gro_hash(struct napi_struct *napi) { int i; for (i = 0; i < GRO_HASH_BUCKETS; i++) { struct sk_buff *skb, *n; list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list) kfree_skb(skb); napi->gro_hash[i].count = 0; } } /* Must be called in process context */ void netif_napi_del(struct napi_struct *napi) { might_sleep(); if (napi_hash_del(napi)) synchronize_net(); list_del_init(&napi->dev_list); napi_free_frags(napi); flush_gro_hash(napi); napi->gro_bitmask = 0; } EXPORT_SYMBOL(netif_napi_del); static int napi_poll(struct napi_struct *n, struct list_head *repoll) { void *have; int work, weight; list_del_init(&n->poll_list); have = netpoll_poll_lock(n); weight = n->weight; /* This NAPI_STATE_SCHED test is for avoiding a race * with netpoll's poll_napi(). Only the entity which * obtains the lock and sees NAPI_STATE_SCHED set will * actually make the ->poll() call. Therefore we avoid * accidentally calling ->poll() when NAPI is not scheduled. */ work = 0; if (test_bit(NAPI_STATE_SCHED, &n->state)) { work = n->poll(n, weight); trace_napi_poll(n, work, weight); } WARN_ON_ONCE(work > weight); if (likely(work < weight)) goto out_unlock; /* Drivers must not modify the NAPI state if they * consume the entire weight. In such cases this code * still "owns" the NAPI instance and therefore can * move the instance around on the list at-will. */ if (unlikely(napi_disable_pending(n))) { napi_complete(n); goto out_unlock; } if (n->gro_bitmask) { /* flush too old packets * If HZ < 1000, flush all packets. */ napi_gro_flush(n, HZ >= 1000); } /* Some drivers may have called napi_schedule * prior to exhausting their budget. */ if (unlikely(!list_empty(&n->poll_list))) { pr_warn_once("%s: Budget exhausted after napi rescheduled\n", n->dev ? n->dev->name : "backlog"); goto out_unlock; } list_add_tail(&n->poll_list, repoll); out_unlock: netpoll_poll_unlock(have); return work; } static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + usecs_to_jiffies(READ_ONCE(netdev_budget_usecs)); int budget = READ_ONCE(netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); local_irq_disable(); list_splice_init(&sd->poll_list, &list); local_irq_enable(); for (;;) { struct napi_struct *n; if (list_empty(&list)) { if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) goto out; break; } n = list_first_entry(&list, struct napi_struct, poll_list); budget -= napi_poll(n, &repoll); /* If softirq window is exhausted then punt. * Allow this to run for 2 jiffies since which will allow * an average latency of 1.5/HZ. */ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) { sd->time_squeeze++; break; } } local_irq_disable(); list_splice_tail_init(&sd->poll_list, &list); list_splice_tail(&repoll, &list); list_splice(&list, &sd->poll_list); if (!list_empty(&sd->poll_list)) __raise_softirq_irqoff(NET_RX_SOFTIRQ); net_rps_action_and_irq_enable(sd); out: __kfree_skb_flush(); } struct netdev_adjacent { struct net_device *dev; /* upper master flag, there can only be one master device per list */ bool master; /* counter for the number of times this device was added to us */ u16 ref_nr; /* private field for the users */ void *private; struct list_head list; struct rcu_head rcu; }; static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, struct list_head *adj_list) { struct netdev_adjacent *adj; list_for_each_entry(adj, adj_list, list) { if (adj->dev == adj_dev) return adj; } return NULL; } static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data) { struct net_device *dev = data; return upper_dev == dev; } /** * netdev_has_upper_dev - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * * Find out if a device is linked to specified upper device and return true * in case it is. Note that this checks only immediate upper device, * not through a complete stack of devices. The caller must hold the RTNL lock. */ bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev) { ASSERT_RTNL(); return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev, upper_dev); } EXPORT_SYMBOL(netdev_has_upper_dev); /** * netdev_has_upper_dev_all - Check if device is linked to an upper device * @dev: device * @upper_dev: upper device to check * * Find out if a device is linked to specified upper device and return true * in case it is. Note that this checks the entire upper device chain. * The caller must hold rcu lock. */ bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev) { return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev, upper_dev); } EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); /** * netdev_has_any_upper_dev - Check if device is linked to some device * @dev: device * * Find out if a device is linked to an upper device and return true in case * it is. The caller must hold the RTNL lock. */ bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->adj_list.upper); } EXPORT_SYMBOL(netdev_has_any_upper_dev); /** * netdev_master_upper_dev_get - Get master upper device * @dev: device * * Find a master upper device and return pointer to it or NULL in case * it's not there. The caller must hold the RTNL lock. */ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) { struct netdev_adjacent *upper; ASSERT_RTNL(); if (list_empty(&dev->adj_list.upper)) return NULL; upper = list_first_entry(&dev->adj_list.upper, struct netdev_adjacent, list); if (likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get); /** * netdev_has_any_lower_dev - Check if device is linked to some device * @dev: device * * Find out if a device is linked to a lower device and return true in case * it is. The caller must hold the RTNL lock. */ static bool netdev_has_any_lower_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->adj_list.lower); } void *netdev_adjacent_get_private(struct list_head *adj_list) { struct netdev_adjacent *adj; adj = list_entry(adj_list, struct netdev_adjacent, list); return adj->private; } EXPORT_SYMBOL(netdev_adjacent_get_private); /** * netdev_upper_get_next_dev_rcu - Get the next dev from upper list * @dev: device * @iter: list_head ** of the current position * * Gets the next device from the dev's upper list, starting from iter * position. The caller must hold RCU read lock. */ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *upper; WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; return upper->dev; } EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); static struct net_device *netdev_next_upper_dev(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *upper; upper = list_entry((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; return upper->dev; } static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *upper; WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&upper->list == &dev->adj_list.upper) return NULL; *iter = &upper->list; return upper->dev; } static int netdev_walk_all_upper_dev(struct net_device *dev, int (*fn)(struct net_device *dev, void *data), void *data) { struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.upper; while (1) { if (now != dev) { ret = fn(now, data); if (ret) return ret; } next = NULL; while (1) { udev = netdev_next_upper_dev(now, &iter); if (!udev) break; next = udev; niter = &udev->adj_list.upper; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } int netdev_walk_all_upper_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *dev, void *data), void *data) { struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.upper; while (1) { if (now != dev) { ret = fn(now, data); if (ret) return ret; } next = NULL; while (1) { udev = netdev_next_upper_dev_rcu(now, &iter); if (!udev) break; next = udev; niter = &udev->adj_list.upper; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu); /** * netdev_lower_get_next_private - Get the next ->private from the * lower neighbour list * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent->private from the dev's lower neighbour * list, starting from iter position. The caller must hold either hold the * RTNL lock or its own locking that guarantees that the neighbour lower * list will remain unchanged. */ void *netdev_lower_get_next_private(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry(*iter, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = lower->list.next; return lower->private; } EXPORT_SYMBOL(netdev_lower_get_next_private); /** * netdev_lower_get_next_private_rcu - Get the next ->private from the * lower neighbour list, RCU * variant * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent->private from the dev's lower neighbour * list, starting from iter position. The caller must hold RCU read lock. */ void *netdev_lower_get_next_private_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; WARN_ON_ONCE(!rcu_read_lock_held()); lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->private; } EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); /** * netdev_lower_get_next - Get the next device from the lower neighbour * list * @dev: device * @iter: list_head ** of the current position * * Gets the next netdev_adjacent from the dev's lower neighbour * list, starting from iter position. The caller must hold RTNL lock or * its own locking that guarantees that the neighbour lower * list will remain unchanged. */ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry(*iter, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = lower->list.next; return lower->dev; } EXPORT_SYMBOL(netdev_lower_get_next); static struct net_device *netdev_next_lower_dev(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->dev; } int netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *dev, void *data), void *data) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.lower; while (1) { if (now != dev) { ret = fn(now, data); if (ret) return ret; } next = NULL; while (1) { ldev = netdev_next_lower_dev(now, &iter); if (!ldev) break; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev); static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, struct list_head **iter) { struct netdev_adjacent *lower; lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); if (&lower->list == &dev->adj_list.lower) return NULL; *iter = &lower->list; return lower->dev; } static u8 __netdev_upper_depth(struct net_device *dev) { struct net_device *udev; struct list_head *iter; u8 max_depth = 0; for (iter = &dev->adj_list.upper, udev = netdev_next_upper_dev(dev, &iter); udev; udev = netdev_next_upper_dev(dev, &iter)) { if (max_depth < udev->upper_level) max_depth = udev->upper_level; } return max_depth; } static u8 __netdev_lower_depth(struct net_device *dev) { struct net_device *ldev; struct list_head *iter; u8 max_depth = 0; for (iter = &dev->adj_list.lower, ldev = netdev_next_lower_dev(dev, &iter); ldev; ldev = netdev_next_lower_dev(dev, &iter)) { if (max_depth < ldev->lower_level) max_depth = ldev->lower_level; } return max_depth; } static int __netdev_update_upper_level(struct net_device *dev, void *data) { dev->upper_level = __netdev_upper_depth(dev) + 1; return 0; } static int __netdev_update_lower_level(struct net_device *dev, void *data) { dev->lower_level = __netdev_lower_depth(dev) + 1; return 0; } int netdev_walk_all_lower_dev_rcu(struct net_device *dev, int (*fn)(struct net_device *dev, void *data), void *data) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int ret, cur = 0; now = dev; iter = &dev->adj_list.lower; while (1) { if (now != dev) { ret = fn(now, data); if (ret) return ret; } next = NULL; while (1) { ldev = netdev_next_lower_dev_rcu(now, &iter); if (!ldev) break; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; break; } if (!next) { if (!cur) return 0; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return 0; } EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu); /** * netdev_lower_get_first_private_rcu - Get the first ->private from the * lower neighbour list, RCU * variant * @dev: device * * Gets the first netdev_adjacent->private from the dev's lower neighbour * list. The caller must hold RCU read lock. */ void *netdev_lower_get_first_private_rcu(struct net_device *dev) { struct netdev_adjacent *lower; lower = list_first_or_null_rcu(&dev->adj_list.lower, struct netdev_adjacent, list); if (lower) return lower->private; return NULL; } EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); /** * netdev_master_upper_dev_get_rcu - Get master upper device * @dev: device * * Find a master upper device and return pointer to it or NULL in case * it's not there. The caller must hold the RCU read lock. */ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { struct netdev_adjacent *upper; upper = list_first_or_null_rcu(&dev->adj_list.upper, struct netdev_adjacent, list); if (upper && likely(upper->master)) return upper->dev; return NULL; } EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); static int netdev_adjacent_sysfs_add(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list) { char linkname[IFNAMSIZ+7]; sprintf(linkname, dev_list == &dev->adj_list.upper ? "upper_%s" : "lower_%s", adj_dev->name); return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), linkname); } static void netdev_adjacent_sysfs_del(struct net_device *dev, char *name, struct list_head *dev_list) { char linkname[IFNAMSIZ+7]; sprintf(linkname, dev_list == &dev->adj_list.upper ? "upper_%s" : "lower_%s", name); sysfs_remove_link(&(dev->dev.kobj), linkname); } static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list) { return (dev_list == &dev->adj_list.upper || dev_list == &dev->adj_list.lower) && net_eq(dev_net(dev), dev_net(adj_dev)); } static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, struct list_head *dev_list, void *private, bool master) { struct netdev_adjacent *adj; int ret; adj = __netdev_find_adj(adj_dev, dev_list); if (adj) { adj->ref_nr += 1; pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n", dev->name, adj_dev->name, adj->ref_nr); return 0; } adj = kmalloc(sizeof(*adj), GFP_KERNEL); if (!adj) return -ENOMEM; adj->dev = adj_dev; adj->master = master; adj->ref_nr = 1; adj->private = private; dev_hold(adj_dev); pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n", dev->name, adj_dev->name, adj->ref_nr, adj_dev->name); if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); if (ret) goto free_adj; } /* Ensure that master link is always the first item in list. */ if (master) { ret = sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), "master"); if (ret) goto remove_symlinks; list_add_rcu(&adj->list, dev_list); } else { list_add_tail_rcu(&adj->list, dev_list); } return 0; remove_symlinks: if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); free_adj: kfree(adj); dev_put(adj_dev); return ret; } static void __netdev_adjacent_dev_remove(struct net_device *dev, struct net_device *adj_dev, u16 ref_nr, struct list_head *dev_list) { struct netdev_adjacent *adj; pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n", dev->name, adj_dev->name, ref_nr); adj = __netdev_find_adj(adj_dev, dev_list); if (!adj) { pr_err("Adjacency does not exist for device %s from %s\n", dev->name, adj_dev->name); WARN_ON(1); return; } if (adj->ref_nr > ref_nr) { pr_debug("adjacency: %s to %s ref_nr - %d = %d\n", dev->name, adj_dev->name, ref_nr, adj->ref_nr - ref_nr); adj->ref_nr -= ref_nr; return; } if (adj->master) sysfs_remove_link(&(dev->dev.kobj), "master"); if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); list_del_rcu(&adj->list); pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n", adj_dev->name, dev->name, adj_dev->name); dev_put(adj_dev); kfree_rcu(adj, rcu); } static int __netdev_adjacent_dev_link_lists(struct net_device *dev, struct net_device *upper_dev, struct list_head *up_list, struct list_head *down_list, void *private, bool master) { int ret; ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, master); if (ret) return ret; ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, false); if (ret) { __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list); return ret; } return 0; } static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, struct net_device *upper_dev, u16 ref_nr, struct list_head *up_list, struct list_head *down_list) { __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); } static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, struct net_device *upper_dev, void *private, bool master) { return __netdev_adjacent_dev_link_lists(dev, upper_dev, &dev->adj_list.upper, &upper_dev->adj_list.lower, private, master); } static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, struct net_device *upper_dev) { __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, &dev->adj_list.upper, &upper_dev->adj_list.lower); } static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, void *upper_priv, void *upper_info, struct netlink_ext_ack *extack) { struct netdev_notifier_changeupper_info changeupper_info = { .info = { .dev = dev, .extack = extack, }, .upper_dev = upper_dev, .master = master, .linking = true, .upper_info = upper_info, }; struct net_device *master_dev; int ret = 0; ASSERT_RTNL(); if (dev == upper_dev) return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. */ if (netdev_has_upper_dev(upper_dev, dev)) return -EBUSY; if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV) return -EMLINK; if (!master) { if (netdev_has_upper_dev(dev, upper_dev)) return -EEXIST; } else { master_dev = netdev_master_upper_dev_get(dev); if (master_dev) return master_dev == upper_dev ? -EEXIST : -EBUSY; } ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) return ret; ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, master); if (ret) return ret; ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) goto rollback; __netdev_update_upper_level(dev, NULL); netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); __netdev_update_lower_level(upper_dev, NULL); netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, NULL); return 0; rollback: __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); return ret; } /** * netdev_upper_dev_link - Add a link to the upper device * @dev: device * @upper_dev: new upper device * @extack: netlink extended ack * * Adds a link to device which is upper to this one. The caller must hold * the RTNL lock. On a failure a negative errno code is returned. * On success the reference counts are adjusted and the function * returns zero. */ int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, struct netlink_ext_ack *extack) { return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL, extack); } EXPORT_SYMBOL(netdev_upper_dev_link); /** * netdev_master_upper_dev_link - Add a master link to the upper device * @dev: device * @upper_dev: new upper device * @upper_priv: upper device private * @upper_info: upper info to be passed down via notifier * @extack: netlink extended ack * * Adds a link to device which is upper to this one. In this case, only * one master upper device can be linked, although other non-master devices * might be linked as well. The caller must hold the RTNL lock. * On a failure a negative errno code is returned. On success the reference * counts are adjusted and the function returns zero. */ int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, void *upper_priv, void *upper_info, struct netlink_ext_ack *extack) { return __netdev_upper_dev_link(dev, upper_dev, true, upper_priv, upper_info, extack); } EXPORT_SYMBOL(netdev_master_upper_dev_link); /** * netdev_upper_dev_unlink - Removes a link to upper device * @dev: device * @upper_dev: new upper device * * Removes a link to device which is upper to this one. The caller must hold * the RTNL lock. */ void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { struct netdev_notifier_changeupper_info changeupper_info = { .info = { .dev = dev, }, .upper_dev = upper_dev, .linking = false, }; ASSERT_RTNL(); changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); __netdev_update_upper_level(dev, NULL); netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL); __netdev_update_lower_level(upper_dev, NULL); netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level, NULL); } EXPORT_SYMBOL(netdev_upper_dev_unlink); /** * netdev_bonding_info_change - Dispatch event about slave change * @dev: device * @bonding_info: info to dispatch * * Send NETDEV_BONDING_INFO to netdev notifiers with info. * The caller must hold the RTNL lock. */ void netdev_bonding_info_change(struct net_device *dev, struct netdev_bonding_info *bonding_info) { struct netdev_notifier_bonding_info info = { .info.dev = dev, }; memcpy(&info.bonding_info, bonding_info, sizeof(struct netdev_bonding_info)); call_netdevice_notifiers_info(NETDEV_BONDING_INFO, &info.info); } EXPORT_SYMBOL(netdev_bonding_info_change); static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_add(dev, iter->dev, &dev->adj_list.upper); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_add(dev, iter->dev, &dev->adj_list.lower); } } static void netdev_adjacent_del_links(struct net_device *dev) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, dev->name, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_del(dev, iter->dev->name, &dev->adj_list.upper); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, dev->name, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_del(dev, iter->dev->name, &dev->adj_list.lower); } } void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) { struct netdev_adjacent *iter; struct net *net = dev_net(dev); list_for_each_entry(iter, &dev->adj_list.upper, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.lower); netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.lower); } list_for_each_entry(iter, &dev->adj_list.lower, list) { if (!net_eq(net, dev_net(iter->dev))) continue; netdev_adjacent_sysfs_del(iter->dev, oldname, &iter->dev->adj_list.upper); netdev_adjacent_sysfs_add(iter->dev, dev, &iter->dev->adj_list.upper); } } void *netdev_lower_dev_get_private(struct net_device *dev, struct net_device *lower_dev) { struct netdev_adjacent *lower; if (!lower_dev) return NULL; lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); if (!lower) return NULL; return lower->private; } EXPORT_SYMBOL(netdev_lower_dev_get_private); int dev_get_nest_level(struct net_device *dev) { struct net_device *lower = NULL; struct list_head *iter; int max_nest = -1; int nest; ASSERT_RTNL(); netdev_for_each_lower_dev(dev, lower, iter) { nest = dev_get_nest_level(lower); if (max_nest < nest) max_nest = nest; } return max_nest + 1; } EXPORT_SYMBOL(dev_get_nest_level); /** * netdev_lower_change - Dispatch event about lower device state change * @lower_dev: device * @lower_state_info: state to dispatch * * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. * The caller must hold the RTNL lock. */ void netdev_lower_state_changed(struct net_device *lower_dev, void *lower_state_info) { struct netdev_notifier_changelowerstate_info changelowerstate_info = { .info.dev = lower_dev, }; ASSERT_RTNL(); changelowerstate_info.lower_state_info = lower_state_info; call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, &changelowerstate_info.info); } EXPORT_SYMBOL(netdev_lower_state_changed); static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_change_rx_flags) ops->ndo_change_rx_flags(dev, flags); } static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; kuid_t uid; kgid_t gid; ASSERT_RTNL(); dev->flags |= IFF_PROMISC; dev->promiscuity += inc; if (dev->promiscuity == 0) { /* * Avoid overflow. * If inc causes overflow, untouch promisc and return error. */ if (inc < 0) dev->flags &= ~IFF_PROMISC; else { dev->promiscuity -= inc; pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n", dev->name); return -EOVERFLOW; } } if (dev->flags != old_flags) { pr_info("device %s %s promiscuous mode\n", dev->name, dev->flags & IFF_PROMISC ? "entered" : "left"); if (audit_enabled) { current_uid_gid(&uid, &gid); audit_log(audit_context(), GFP_ATOMIC, AUDIT_ANOM_PROMISCUOUS, "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", dev->name, (dev->flags & IFF_PROMISC), (old_flags & IFF_PROMISC), from_kuid(&init_user_ns, audit_get_loginuid(current)), from_kuid(&init_user_ns, uid), from_kgid(&init_user_ns, gid), audit_get_sessionid(current)); } dev_change_rx_flags(dev, IFF_PROMISC); } if (notify) __dev_notify_flags(dev, old_flags, IFF_PROMISC); return 0; } /** * dev_set_promiscuity - update promiscuity count on a device * @dev: device * @inc: modifier * * Add or remove promiscuity from a device. While the count in the device * remains above zero the interface remains promiscuous. Once it hits zero * the device reverts back to normal filtering operation. A negative inc * value is used to drop promiscuity on the device. * Return 0 if successful or a negative errno code on error. */ int dev_set_promiscuity(struct net_device *dev, int inc) { unsigned int old_flags = dev->flags; int err; err = __dev_set_promiscuity(dev, inc, true); if (err < 0) return err; if (dev->flags != old_flags) dev_set_rx_mode(dev); return err; } EXPORT_SYMBOL(dev_set_promiscuity); static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags, old_gflags = dev->gflags; ASSERT_RTNL(); dev->flags |= IFF_ALLMULTI; dev->allmulti += inc; if (dev->allmulti == 0) { /* * Avoid overflow. * If inc causes overflow, untouch allmulti and return error. */ if (inc < 0) dev->flags &= ~IFF_ALLMULTI; else { dev->allmulti -= inc; pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n", dev->name); return -EOVERFLOW; } } if (dev->flags ^ old_flags) { dev_change_rx_flags(dev, IFF_ALLMULTI); dev_set_rx_mode(dev); if (notify) __dev_notify_flags(dev, old_flags, dev->gflags ^ old_gflags); } return 0; } /** * dev_set_allmulti - update allmulti count on a device * @dev: device * @inc: modifier * * Add or remove reception of all multicast frames to a device. While the * count in the device remains above zero the interface remains listening * to all interfaces. Once it hits zero the device reverts back to normal * filtering operation. A negative @inc value is used to drop the counter * when releasing a resource needing all multicasts. * Return 0 if successful or a negative errno code on error. */ int dev_set_allmulti(struct net_device *dev, int inc) { return __dev_set_allmulti(dev, inc, true); } EXPORT_SYMBOL(dev_set_allmulti); /* * Upload unicast and multicast address lists to device and * configure RX filtering. When the device doesn't support unicast * filtering it is put in promiscuous mode while unicast addresses * are present. */ void __dev_set_rx_mode(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; /* dev_open will call this function so the list will stay sane. */ if (!(dev->flags&IFF_UP)) return; if (!netif_device_present(dev)) return; if (!(dev->priv_flags & IFF_UNICAST_FLT)) { /* Unicast addresses changes may only happen under the rtnl, * therefore calling __dev_set_promiscuity here is safe. */ if (!netdev_uc_empty(dev) && !dev->uc_promisc) { __dev_set_promiscuity(dev, 1, false); dev->uc_promisc = true; } else if (netdev_uc_empty(dev) && dev->uc_promisc) { __dev_set_promiscuity(dev, -1, false); dev->uc_promisc = false; } } if (ops->ndo_set_rx_mode) ops->ndo_set_rx_mode(dev); } void dev_set_rx_mode(struct net_device *dev) { netif_addr_lock_bh(dev); __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); } /** * dev_get_flags - get flags reported to userspace * @dev: device * * Get the combination of flag bits exported through APIs to userspace. */ unsigned int dev_get_flags(const struct net_device *dev) { unsigned int flags; flags = (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI | IFF_RUNNING | IFF_LOWER_UP | IFF_DORMANT)) | (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI)); if (netif_running(dev)) { if (netif_oper_up(dev)) flags |= IFF_RUNNING; if (netif_carrier_ok(dev)) flags |= IFF_LOWER_UP; if (netif_dormant(dev)) flags |= IFF_DORMANT; } return flags; } EXPORT_SYMBOL(dev_get_flags); int __dev_change_flags(struct net_device *dev, unsigned int flags) { unsigned int old_flags = dev->flags; int ret; ASSERT_RTNL(); /* * Set the flags on our device. */ dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | IFF_AUTOMEDIA)) | (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | IFF_ALLMULTI)); /* * Load in the correct multicast list now the flags have changed. */ if ((old_flags ^ flags) & IFF_MULTICAST) dev_change_rx_flags(dev, IFF_MULTICAST); dev_set_rx_mode(dev); /* * Have we downed the interface. We handle IFF_UP ourselves * according to user attempts to set it, rather than blindly * setting it. */ ret = 0; if ((old_flags ^ flags) & IFF_UP) { if (old_flags & IFF_UP) __dev_close(dev); else ret = __dev_open(dev); } if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; unsigned int old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; if (__dev_set_promiscuity(dev, inc, false) >= 0) if (dev->flags != old_flags) dev_set_rx_mode(dev); } /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI * is important. Some (broken) drivers set IFF_PROMISC, when * IFF_ALLMULTI is requested not asking us and not reporting. */ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { int inc = (flags & IFF_ALLMULTI) ? 1 : -1; dev->gflags ^= IFF_ALLMULTI; __dev_set_allmulti(dev, inc, false); } return ret; } void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, unsigned int gchanges) { unsigned int changes = dev->flags ^ old_flags; if (gchanges) rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); if (changes & IFF_UP) { if (dev->flags & IFF_UP) call_netdevice_notifiers(NETDEV_UP, dev); else call_netdevice_notifiers(NETDEV_DOWN, dev); } if (dev->flags & IFF_UP && (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { struct netdev_notifier_change_info change_info = { .info = { .dev = dev, }, .flags_changed = changes, }; call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); } } /** * dev_change_flags - change device settings * @dev: device * @flags: device state flags * * Change settings on device based state flags. The flags are * in the userspace exported format. */ int dev_change_flags(struct net_device *dev, unsigned int flags) { int ret; unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; ret = __dev_change_flags(dev, flags); if (ret < 0) return ret; changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); __dev_notify_flags(dev, old_flags, changes); return ret; } EXPORT_SYMBOL(dev_change_flags); int __dev_set_mtu(struct net_device *dev, int new_mtu) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_change_mtu) return ops->ndo_change_mtu(dev, new_mtu); /* Pairs with all the lockless reads of dev->mtu in the stack */ WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(__dev_set_mtu); int dev_validate_mtu(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack) { /* MTU must be positive, and in range */ if (new_mtu < 0 || new_mtu < dev->min_mtu) { NL_SET_ERR_MSG(extack, "mtu less than device minimum"); return -EINVAL; } if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) { NL_SET_ERR_MSG(extack, "mtu greater than device maximum"); return -EINVAL; } return 0; } /** * dev_set_mtu_ext - Change maximum transfer unit * @dev: device * @new_mtu: new transfer unit * @extack: netlink extended ack * * Change the maximum transfer size of the network device. */ int dev_set_mtu_ext(struct net_device *dev, int new_mtu, struct netlink_ext_ack *extack) { int err, orig_mtu; if (new_mtu == dev->mtu) return 0; err = dev_validate_mtu(dev, new_mtu, extack); if (err) return err; if (!netif_device_present(dev)) return -ENODEV; err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); err = notifier_to_errno(err); if (err) return err; orig_mtu = dev->mtu; err = __dev_set_mtu(dev, new_mtu); if (!err) { err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, orig_mtu); err = notifier_to_errno(err); if (err) { /* setting mtu back and notifying everyone again, * so that they have a chance to revert changes. */ __dev_set_mtu(dev, orig_mtu); call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, new_mtu); } } return err; } int dev_set_mtu(struct net_device *dev, int new_mtu) { struct netlink_ext_ack extack; int err; memset(&extack, 0, sizeof(extack)); err = dev_set_mtu_ext(dev, new_mtu, &extack); if (err && extack._msg) net_err_ratelimited("%s: %s\n", dev->name, extack._msg); return err; } EXPORT_SYMBOL(dev_set_mtu); /** * dev_change_tx_queue_len - Change TX queue length of a netdevice * @dev: device * @new_len: new tx queue length */ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) { unsigned int orig_len = dev->tx_queue_len; int res; if (new_len != (unsigned int)new_len) return -ERANGE; if (new_len != orig_len) { dev->tx_queue_len = new_len; res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); res = notifier_to_errno(res); if (res) goto err_rollback; res = dev_qdisc_change_tx_queue_len(dev); if (res) goto err_rollback; } return 0; err_rollback: netdev_err(dev, "refused to change device tx_queue_len\n"); dev->tx_queue_len = orig_len; return res; } /** * dev_set_group - Change group this device belongs to * @dev: device * @new_group: group this device should belong to */ void dev_set_group(struct net_device *dev, int new_group) { dev->group = new_group; } EXPORT_SYMBOL(dev_set_group); /** * dev_set_mac_address - Change Media Access Control Address * @dev: device * @sa: new address * * Change the hardware (MAC) address of the device */ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) { const struct net_device_ops *ops = dev->netdev_ops; int err; if (!ops->ndo_set_mac_address) return -EOPNOTSUPP; if (sa->sa_family != dev->type) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; err = ops->ndo_set_mac_address(dev, sa); if (err) return err; dev->addr_assign_type = NET_ADDR_SET; call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); add_device_randomness(dev->dev_addr, dev->addr_len); return 0; } EXPORT_SYMBOL(dev_set_mac_address); /** * dev_change_carrier - Change device carrier * @dev: device * @new_carrier: new value * * Change device carrier */ int dev_change_carrier(struct net_device *dev, bool new_carrier) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_change_carrier) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; return ops->ndo_change_carrier(dev, new_carrier); } EXPORT_SYMBOL(dev_change_carrier); /** * dev_get_phys_port_id - Get device physical port ID * @dev: device * @ppid: port ID * * Get device physical port ID */ int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_get_phys_port_id) return -EOPNOTSUPP; return ops->ndo_get_phys_port_id(dev, ppid); } EXPORT_SYMBOL(dev_get_phys_port_id); /** * dev_get_phys_port_name - Get device physical port name * @dev: device * @name: port name * @len: limit of bytes to copy to name * * Get device physical port name */ int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_get_phys_port_name) return -EOPNOTSUPP; return ops->ndo_get_phys_port_name(dev, name, len); } EXPORT_SYMBOL(dev_get_phys_port_name); /** * dev_change_proto_down - update protocol port state information * @dev: device * @proto_down: new value * * This info can be used by switch drivers to set the phys state of the * port. */ int dev_change_proto_down(struct net_device *dev, bool proto_down) { const struct net_device_ops *ops = dev->netdev_ops; if (!ops->ndo_change_proto_down) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; return ops->ndo_change_proto_down(dev, proto_down); } EXPORT_SYMBOL(dev_change_proto_down); u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, enum bpf_netdev_command cmd) { struct netdev_bpf xdp; if (!bpf_op) return 0; memset(&xdp, 0, sizeof(xdp)); xdp.command = cmd; /* Query must always succeed. */ WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG); return xdp.prog_id; } static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, struct netlink_ext_ack *extack, u32 flags, struct bpf_prog *prog) { struct netdev_bpf xdp; memset(&xdp, 0, sizeof(xdp)); if (flags & XDP_FLAGS_HW_MODE) xdp.command = XDP_SETUP_PROG_HW; else xdp.command = XDP_SETUP_PROG; xdp.extack = extack; xdp.flags = flags; xdp.prog = prog; return bpf_op(dev, &xdp); } static void dev_xdp_uninstall(struct net_device *dev) { struct netdev_bpf xdp; bpf_op_t ndo_bpf; /* Remove generic XDP */ WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL)); /* Remove from the driver */ ndo_bpf = dev->netdev_ops->ndo_bpf; if (!ndo_bpf) return; memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_QUERY_PROG; WARN_ON(ndo_bpf(dev, &xdp)); if (xdp.prog_id) WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL)); /* Remove HW offload */ memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_QUERY_PROG_HW; if (!ndo_bpf(dev, &xdp) && xdp.prog_id) WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL)); } /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device * @extack: netlink extended ack * @fd: new program fd or negative value to clear * @flags: xdp-related flags * * Set or clear a bpf program for a device */ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags) { const struct net_device_ops *ops = dev->netdev_ops; enum bpf_netdev_command query; struct bpf_prog *prog = NULL; bpf_op_t bpf_op, bpf_chk; int err; ASSERT_RTNL(); query = flags & XDP_FLAGS_HW_MODE ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG; bpf_op = bpf_chk = ops->ndo_bpf; if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) return -EOPNOTSUPP; if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE)) bpf_op = generic_xdp_install; if (bpf_op == bpf_chk) bpf_chk = generic_xdp_install; if (fd >= 0) { if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) || __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && __dev_xdp_query(dev, bpf_op, query)) return -EBUSY; prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, bpf_op == ops->ndo_bpf); if (IS_ERR(prog)) return PTR_ERR(prog); if (!(flags & XDP_FLAGS_HW_MODE) && bpf_prog_is_dev_bound(prog->aux)) { NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported"); bpf_prog_put(prog); return -EINVAL; } } err = dev_xdp_install(dev, bpf_op, extack, flags, prog); if (err < 0 && prog) bpf_prog_put(prog); return err; } /** * dev_new_index - allocate an ifindex * @net: the applicable net namespace * * Returns a suitable unique value for a new device interface * number. The caller must hold the rtnl semaphore or the * dev_base_lock to be sure it remains unique. */ static int dev_new_index(struct net *net) { int ifindex = net->ifindex; for (;;) { if (++ifindex <= 0) ifindex = 1; if (!__dev_get_by_index(net, ifindex)) return net->ifindex = ifindex; } } /* Delayed registration/unregisteration */ static LIST_HEAD(net_todo_list); DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); dev_net(dev)->dev_unreg_count++; } static void rollback_registered_many(struct list_head *head) { struct net_device *dev, *tmp; LIST_HEAD(close_head); BUG_ON(dev_boot_phase); ASSERT_RTNL(); list_for_each_entry_safe(dev, tmp, head, unreg_list) { /* Some devices call without registering * for initialization unwind. Remove those * devices and proceed with the remaining. */ if (dev->reg_state == NETREG_UNINITIALIZED) { pr_debug("unregister_netdevice: device %s/%p never was registered\n", dev->name, dev); WARN_ON(1); list_del(&dev->unreg_list); continue; } dev->dismantle = true; BUG_ON(dev->reg_state != NETREG_REGISTERED); } /* If device is running, close it first. */ list_for_each_entry(dev, head, unreg_list) list_add_tail(&dev->close_list, &close_head); dev_close_many(&close_head, true); list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ unlist_netdevice(dev); dev->reg_state = NETREG_UNREGISTERING; } flush_all_backlogs(); synchronize_net(); list_for_each_entry(dev, head, unreg_list) { struct sk_buff *skb = NULL; /* Shutdown queueing discipline. */ dev_shutdown(dev); dev_xdp_uninstall(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL, NULL, 0); /* * Flush the unicast and multicast chains */ dev_uc_flush(dev); dev_mc_flush(dev); if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); if (skb) rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); /* Notifier chain MUST detach us all upper devices. */ WARN_ON(netdev_has_any_upper_dev(dev)); WARN_ON(netdev_has_any_lower_dev(dev)); /* Remove entries from kobject tree */ netdev_unregister_kobject(dev); #ifdef CONFIG_XPS /* Remove XPS queueing entries */ netif_reset_xps_queues_gt(dev, 0); #endif } synchronize_net(); list_for_each_entry(dev, head, unreg_list) dev_put(dev); } static void rollback_registered(struct net_device *dev) { LIST_HEAD(single); list_add(&dev->unreg_list, &single); rollback_registered_many(&single); list_del(&single); } static netdev_features_t netdev_sync_upper_features(struct net_device *lower, struct net_device *upper, netdev_features_t features) { netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; netdev_features_t feature; int feature_bit; for_each_netdev_feature(upper_disables, feature_bit) { feature = __NETIF_F_BIT(feature_bit); if (!(upper->wanted_features & feature) && (features & feature)) { netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n", &feature, upper->name); features &= ~feature; } } return features; } static void netdev_sync_lower_features(struct net_device *upper, struct net_device *lower, netdev_features_t features) { netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; netdev_features_t feature; int feature_bit; for_each_netdev_feature(upper_disables, feature_bit) { feature = __NETIF_F_BIT(feature_bit); if (!(features & feature) && (lower->features & feature)) { netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", &feature, lower->name); lower->wanted_features &= ~feature; __netdev_update_features(lower); if (unlikely(lower->features & feature)) netdev_WARN(upper, "failed to disable %pNF on %s!\n", &feature, lower->name); else netdev_features_change(lower); } } } static netdev_features_t netdev_fix_features(struct net_device *dev, netdev_features_t features) { /* Fix illegal checksum combinations */ if ((features & NETIF_F_HW_CSUM) && (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { netdev_warn(dev, "mixed HW and IP checksum settings.\n"); features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); } /* TSO requires that SG is present as well. */ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); features &= ~NETIF_F_ALL_TSO; } if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && !(features & NETIF_F_IP_CSUM)) { netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); features &= ~NETIF_F_TSO; features &= ~NETIF_F_TSO_ECN; } if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && !(features & NETIF_F_IPV6_CSUM)) { netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); features &= ~NETIF_F_TSO6; } /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */ if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO)) features &= ~NETIF_F_TSO_MANGLEID; /* TSO ECN requires that TSO is present as well. */ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) features &= ~NETIF_F_TSO_ECN; /* Software GSO depends on SG. */ if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); features &= ~NETIF_F_GSO; } /* GSO partial features require GSO partial be set */ if ((features & dev->gso_partial_features) && !(features & NETIF_F_GSO_PARTIAL)) { netdev_dbg(dev, "Dropping partially supported GSO features since no GSO partial.\n"); features &= ~dev->gso_partial_features; } if (!(features & NETIF_F_RXCSUM)) { /* NETIF_F_GRO_HW implies doing RXCSUM since every packet * successfully merged by hardware must also have the * checksum verified by hardware. If the user does not * want to enable RXCSUM, logically, we should disable GRO_HW. */ if (features & NETIF_F_GRO_HW) { netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n"); features &= ~NETIF_F_GRO_HW; } } /* LRO/HW-GRO features cannot be combined with RX-FCS */ if (features & NETIF_F_RXFCS) { if (features & NETIF_F_LRO) { netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n"); features &= ~NETIF_F_LRO; } if (features & NETIF_F_GRO_HW) { netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n"); features &= ~NETIF_F_GRO_HW; } } if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); features &= ~NETIF_F_HW_TLS_RX; } return features; } int __netdev_update_features(struct net_device *dev) { struct net_device *upper, *lower; netdev_features_t features; struct list_head *iter; int err = -1; ASSERT_RTNL(); features = netdev_get_wanted_features(dev); if (dev->netdev_ops->ndo_fix_features) features = dev->netdev_ops->ndo_fix_features(dev, features); /* driver might be less strict about feature dependencies */ features = netdev_fix_features(dev, features); /* some features can't be enabled if they're off an an upper device */ netdev_for_each_upper_dev_rcu(dev, upper, iter) features = netdev_sync_upper_features(dev, upper, features); if (dev->features == features) goto sync_lower; netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", &dev->features, &features); if (dev->netdev_ops->ndo_set_features) err = dev->netdev_ops->ndo_set_features(dev, features); else err = 0; if (unlikely(err < 0)) { netdev_err(dev, "set_features() failed (%d); wanted %pNF, left %pNF\n", err, &features, &dev->features); /* return non-0 since some features might have changed and * it's better to fire a spurious notification than miss it */ return -1; } sync_lower: /* some features must be disabled on lower devices when disabled * on an upper device (think: bonding master or bridge) */ netdev_for_each_lower_dev(dev, lower, iter) netdev_sync_lower_features(dev, lower, features); if (!err) { netdev_features_t diff = features ^ dev->features; if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) { /* udp_tunnel_{get,drop}_rx_info both need * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the * device, or they won't do anything. * Thus we need to update dev->features * *before* calling udp_tunnel_get_rx_info, * but *after* calling udp_tunnel_drop_rx_info. */ if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { dev->features = features; udp_tunnel_get_rx_info(dev); } else { udp_tunnel_drop_rx_info(dev); } } if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) { if (features & NETIF_F_HW_VLAN_CTAG_FILTER) { dev->features = features; err |= vlan_get_rx_ctag_filter_info(dev); } else { vlan_drop_rx_ctag_filter_info(dev); } } if (diff & NETIF_F_HW_VLAN_STAG_FILTER) { if (features & NETIF_F_HW_VLAN_STAG_FILTER) { dev->features = features; err |= vlan_get_rx_stag_filter_info(dev); } else { vlan_drop_rx_stag_filter_info(dev); } } dev->features = features; } return err < 0 ? 0 : 1; } /** * netdev_update_features - recalculate device features * @dev: the device to check * * Recalculate dev->features set and send notifications if it * has changed. Should be called after driver or hardware dependent * conditions might have changed that influence the features. */ void netdev_update_features(struct net_device *dev) { if (__netdev_update_features(dev)) netdev_features_change(dev); } EXPORT_SYMBOL(netdev_update_features); /** * netdev_change_features - recalculate device features * @dev: the device to check * * Recalculate dev->features set and send notifications even * if they have not changed. Should be called instead of * netdev_update_features() if also dev->vlan_features might * have changed to allow the changes to be propagated to stacked * VLAN devices. */ void netdev_change_features(struct net_device *dev) { __netdev_update_features(dev); netdev_features_change(dev); } EXPORT_SYMBOL(netdev_change_features); /** * netif_stacked_transfer_operstate - transfer operstate * @rootdev: the root or lower level device to transfer state from * @dev: the device to transfer operstate to * * Transfer operational state from root to device. This is normally * called when a stacking relationship exists between the root * device and the device(a leaf device). */ void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev) { if (rootdev->operstate == IF_OPER_DORMANT) netif_dormant_on(dev); else netif_dormant_off(dev); if (netif_carrier_ok(rootdev)) netif_carrier_on(dev); else netif_carrier_off(dev); } EXPORT_SYMBOL(netif_stacked_transfer_operstate); static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; size_t sz = count * sizeof(*rx); int err = 0; BUG_ON(count < 1); rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!rx) return -ENOMEM; dev->_rx = rx; for (i = 0; i < count; i++) { rx[i].dev = dev; /* XDP RX-queue setup */ err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i); if (err < 0) goto err_rxq_info; } return 0; err_rxq_info: /* Rollback successful reg's and free other resources */ while (i--) xdp_rxq_info_unreg(&rx[i].xdp_rxq); kvfree(dev->_rx); dev->_rx = NULL; return err; } static void netif_free_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ if (!dev->_rx) return; for (i = 0; i < count; i++) xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq); kvfree(dev->_rx); } static void netdev_init_one_queue(struct net_device *dev, struct netdev_queue *queue, void *_unused) { /* Initialize queue lock */ spin_lock_init(&queue->_xmit_lock); netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); queue->xmit_lock_owner = -1; netdev_queue_numa_node_write(queue, NUMA_NO_NODE); queue->dev = dev; #ifdef CONFIG_BQL dql_init(&queue->dql, HZ); #endif } static void netif_free_tx_queues(struct net_device *dev) { kvfree(dev->_tx); } static int netif_alloc_netdev_queues(struct net_device *dev) { unsigned int count = dev->num_tx_queues; struct netdev_queue *tx; size_t sz = count * sizeof(*tx); if (count < 1 || count > 0xffff) return -EINVAL; tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!tx) return -ENOMEM; dev->_tx = tx; netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); spin_lock_init(&dev->tx_global_lock); return 0; } void netif_tx_stop_all_queues(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); netif_tx_stop_queue(txq); } } EXPORT_SYMBOL(netif_tx_stop_all_queues); /** * register_netdevice - register a network device * @dev: device to register * * Take a completed network device structure and add it to the kernel * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier * chain. 0 is returned on success. A negative errno code is returned * on a failure to set up the device, or if the name is a duplicate. * * Callers must hold the rtnl semaphore. You may want * register_netdev() instead of this. * * BUGS: * The locking appears insufficient to guarantee two parallel registers * will not get the same name. */ int register_netdevice(struct net_device *dev) { int ret; struct net *net = dev_net(dev); BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE < NETDEV_FEATURE_COUNT); BUG_ON(dev_boot_phase); ASSERT_RTNL(); might_sleep(); /* When net_device's are persistent, this will be fatal. */ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); BUG_ON(!net); spin_lock_init(&dev->addr_list_lock); netdev_set_addr_lockdep_class(dev); ret = dev_get_valid_name(net, dev, dev->name); if (ret < 0) goto out; /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); if (ret) { if (ret > 0) ret = -EIO; goto out; } } if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_CTAG_FILTER) && (!dev->netdev_ops->ndo_vlan_rx_add_vid || !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); ret = -EINVAL; goto err_uninit; } ret = -EBUSY; if (!dev->ifindex) dev->ifindex = dev_new_index(net); else if (__dev_get_by_index(net, dev->ifindex)) goto err_uninit; /* Transfer changeable features to wanted_features and enable * software offloads (GSO and GRO). */ dev->hw_features |= NETIF_F_SOFT_FEATURES; dev->features |= NETIF_F_SOFT_FEATURES; if (dev->netdev_ops->ndo_udp_tunnel_add) { dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT; } dev->wanted_features = dev->features & dev->hw_features; if (!(dev->flags & IFF_LOOPBACK)) dev->hw_features |= NETIF_F_NOCACHE_COPY; /* If IPv4 TCP segmentation offload is supported we should also * allow the device to enable segmenting the frame with the option * of ignoring a static IP ID value. This doesn't enable the * feature itself but allows the user to enable it later. */ if (dev->hw_features & NETIF_F_TSO) dev->hw_features |= NETIF_F_TSO_MANGLEID; if (dev->vlan_features & NETIF_F_TSO) dev->vlan_features |= NETIF_F_TSO_MANGLEID; if (dev->mpls_features & NETIF_F_TSO) dev->mpls_features |= NETIF_F_TSO_MANGLEID; if (dev->hw_enc_features & NETIF_F_TSO) dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. */ dev->vlan_features |= NETIF_F_HIGHDMA; /* Make NETIF_F_SG inheritable to tunnel devices. */ dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL; /* Make NETIF_F_SG inheritable to MPLS. */ dev->mpls_features |= NETIF_F_SG; ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) goto err_uninit; ret = netdev_register_kobject(dev); if (ret) { dev->reg_state = NETREG_UNREGISTERED; goto err_uninit; } dev->reg_state = NETREG_REGISTERED; __netdev_update_features(dev); /* * Default initial state at registry is that the * device is present. */ set_bit(__LINK_STATE_PRESENT, &dev->state); linkwatch_init_dev(dev); dev_init_scheduler(dev); dev_hold(dev); list_netdevice(dev); add_device_randomness(dev->dev_addr, dev->addr_len); /* If the device has permanent device address, driver should * set dev_addr and also addr_assign_type should be set to * NET_ADDR_PERM (default value). */ if (dev->addr_assign_type == NET_ADDR_PERM) memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); /* Notify protocols, that a new device appeared. */ ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); ret = notifier_to_errno(ret); if (ret) { rollback_registered(dev); rcu_barrier(); dev->reg_state = NETREG_UNREGISTERED; /* We should put the kobject that hold in * netdev_unregister_kobject(), otherwise * the net device cannot be freed when * driver calls free_netdev(), because the * kobject is being hold. */ kobject_put(&dev->dev.kobj); } /* * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); out: return ret; err_uninit: if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); if (dev->priv_destructor) dev->priv_destructor(dev); goto out; } EXPORT_SYMBOL(register_netdevice); /** * init_dummy_netdev - init a dummy network device for NAPI * @dev: device to init * * This takes a network device structure and initialize the minimum * amount of fields so it can be used to schedule NAPI polls without * registering a full blown interface. This is to be used by drivers * that need to tie several hardware interfaces to a single NAPI * poll scheduler due to HW limitations. */ int init_dummy_netdev(struct net_device *dev) { /* Clear everything. Note we don't initialize spinlocks * are they aren't supposed to be taken by any of the * NAPI code and this dummy netdev is supposed to be * only ever used for NAPI polls */ memset(dev, 0, sizeof(struct net_device)); /* make sure we BUG if trying to hit standard * register/unregister code path */ dev->reg_state = NETREG_DUMMY; /* NAPI wants this */ INIT_LIST_HEAD(&dev->napi_list); /* a dummy interface is started by default */ set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); /* napi_busy_loop stats accounting wants this */ dev_net_set(dev, &init_net); /* Note : We dont allocate pcpu_refcnt for dummy devices, * because users of this 'device' dont need to change * its refcount. */ return 0; } EXPORT_SYMBOL_GPL(init_dummy_netdev); /** * register_netdev - register a network device * @dev: device to register * * Take a completed network device structure and add it to the kernel * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier * chain. 0 is returned on success. A negative errno code is returned * on a failure to set up the device, or if the name is a duplicate. * * This is a wrapper around register_netdevice that takes the rtnl semaphore * and expands the device name if you passed a format string to * alloc_netdev. */ int register_netdev(struct net_device *dev) { int err; if (rtnl_lock_killable()) return -EINTR; err = register_netdevice(dev); rtnl_unlock(); return err; } EXPORT_SYMBOL(register_netdev); int netdev_refcnt_read(const struct net_device *dev) { int i, refcnt = 0; for_each_possible_cpu(i) refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); return refcnt; } EXPORT_SYMBOL(netdev_refcnt_read); /** * netdev_wait_allrefs - wait until all references are gone. * @dev: target net_device * * This is called when unregistering network devices. * * Any protocol or device that holds a reference should register * for netdevice notification, and cleanup and put back the * reference if they receive an UNREGISTER event. * We can get stuck here if buggy protocols don't correctly * call dev_put. */ static void netdev_wait_allrefs(struct net_device *dev) { unsigned long rebroadcast_time, warning_time; int refcnt; linkwatch_forget_dev(dev); rebroadcast_time = warning_time = jiffies; refcnt = netdev_refcnt_read(dev); while (refcnt != 0) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); /* Rebroadcast unregister notification */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); __rtnl_unlock(); rcu_barrier(); rtnl_lock(); if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { /* We must not have linkwatch events * pending on unregister. If this * happens, we simply run the queue * unscheduled, resulting in a noop * for this device. */ linkwatch_run_queue(); } __rtnl_unlock(); rebroadcast_time = jiffies; } msleep(250); refcnt = netdev_refcnt_read(dev); if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) { pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", dev->name, refcnt); warning_time = jiffies; } } } /* The sequence is: * * rtnl_lock(); * ... * register_netdevice(x1); * register_netdevice(x2); * ... * unregister_netdevice(y1); * unregister_netdevice(y2); * ... * rtnl_unlock(); * free_netdev(y1); * free_netdev(y2); * * We are invoked by rtnl_unlock(). * This allows us to deal with problems: * 1) We can delete sysfs objects which invoke hotplug * without deadlocking with linkwatch via keventd. * 2) Since we run with the RTNL semaphore not held, we can sleep * safely in order to wait for the netdev refcnt to drop to zero. * * We must not return until all unregister events added during * the interval the lock was held have been completed. */ void netdev_run_todo(void) { struct list_head list; /* Snapshot list, allow later requests */ list_replace_init(&net_todo_list, &list); __rtnl_unlock(); /* Wait for rcu callbacks to finish before next phase */ if (!list_empty(&list)) rcu_barrier(); while (!list_empty(&list)) { struct net_device *dev = list_first_entry(&list, struct net_device, todo_list); list_del(&dev->todo_list); if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { pr_err("network todo '%s' but state %d\n", dev->name, dev->reg_state); dump_stack(); continue; } dev->reg_state = NETREG_UNREGISTERED; netdev_wait_allrefs(dev); /* paranoia */ BUG_ON(netdev_refcnt_read(dev)); BUG_ON(!list_empty(&dev->ptype_all)); BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); #if IS_ENABLED(CONFIG_DECNET) WARN_ON(dev->dn_ptr); #endif if (dev->priv_destructor) dev->priv_destructor(dev); if (dev->needs_free_netdev) free_netdev(dev); /* Report a network device has been unregistered */ rtnl_lock(); dev_net(dev)->dev_unreg_count--; __rtnl_unlock(); wake_up(&netdev_unregistering_wq); /* Free network device */ kobject_put(&dev->dev.kobj); } } /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has * all the same fields in the same order as net_device_stats, with only * the type differing, but rtnl_link_stats64 may have additional fields * at the end for newer counters. */ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats) { #if BITS_PER_LONG == 64 BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); memcpy(stats64, netdev_stats, sizeof(*netdev_stats)); /* zero out counters that only exist in rtnl_link_stats64 */ memset((char *)stats64 + sizeof(*netdev_stats), 0, sizeof(*stats64) - sizeof(*netdev_stats)); #else size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long); const unsigned long *src = (const unsigned long *)netdev_stats; u64 *dst = (u64 *)stats64; BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) dst[i] = src[i]; /* zero out counters that only exist in rtnl_link_stats64 */ memset((char *)stats64 + n * sizeof(u64), 0, sizeof(*stats64) - n * sizeof(u64)); #endif } EXPORT_SYMBOL(netdev_stats_to_stats64); /** * dev_get_stats - get network device statistics * @dev: device to get statistics from * @storage: place to store stats * * Get network statistics from device. Return @storage. * The device driver may provide its own method by setting * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; * otherwise the internal statistics structure is used. */ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, struct rtnl_link_stats64 *storage) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); ops->ndo_get_stats64(dev, storage); } else if (ops->ndo_get_stats) { netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); } else { netdev_stats_to_stats64(storage, &dev->stats); } storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped); storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped); storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler); return storage; } EXPORT_SYMBOL(dev_get_stats); struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) { struct netdev_queue *queue = dev_ingress_queue(dev); #ifdef CONFIG_NET_CLS_ACT if (queue) return queue; queue = kzalloc(sizeof(*queue), GFP_KERNEL); if (!queue) return NULL; netdev_init_one_queue(dev, queue, NULL); RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); queue->qdisc_sleeping = &noop_qdisc; rcu_assign_pointer(dev->ingress_queue, queue); #endif return queue; } static const struct ethtool_ops default_ethtool_ops; void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops) { if (dev->ethtool_ops == &default_ethtool_ops) dev->ethtool_ops = ops; } EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); void netdev_freemem(struct net_device *dev) { char *addr = (char *)dev - dev->padded; kvfree(addr); } /** * alloc_netdev_mqs - allocate network device * @sizeof_priv: size of private data to allocate space for * @name: device name format string * @name_assign_type: origin of device name * @setup: callback to initialize device * @txqs: the number of TX subqueues to allocate * @rxqs: the number of RX subqueues to allocate * * Allocates a struct net_device with private data area for driver use * and performs basic initialization. Also allocates subqueue structs * for each queue on the device. */ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), unsigned int txqs, unsigned int rxqs) { struct net_device *dev; unsigned int alloc_size; struct net_device *p; BUG_ON(strlen(name) >= sizeof(dev->name)); if (txqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); return NULL; } if (rxqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); return NULL; } alloc_size = sizeof(struct net_device); if (sizeof_priv) { /* ensure 32-byte alignment of private area */ alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); alloc_size += sizeof_priv; } /* ensure 32-byte alignment of whole construct */ alloc_size += NETDEV_ALIGN - 1; p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!p) return NULL; dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) goto free_dev; if (dev_addr_init(dev)) goto free_pcpu; dev_mc_init(dev); dev_uc_init(dev); dev_net_set(dev, &init_net); dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; dev->upper_level = 1; dev->lower_level = 1; INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); INIT_LIST_HEAD(&dev->close_list); INIT_LIST_HEAD(&dev->link_watch_list); INIT_LIST_HEAD(&dev->adj_list.upper); INIT_LIST_HEAD(&dev->adj_list.lower); INIT_LIST_HEAD(&dev->ptype_all); INIT_LIST_HEAD(&dev->ptype_specific); #ifdef CONFIG_NET_SCHED hash_init(dev->qdisc_hash); #endif dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); if (!dev->tx_queue_len) { dev->priv_flags |= IFF_NO_QUEUE; dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; } dev->num_tx_queues = txqs; dev->real_num_tx_queues = txqs; if (netif_alloc_netdev_queues(dev)) goto free_all; dev->num_rx_queues = rxqs; dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_all; strcpy(dev->name, name); dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; nf_hook_ingress_init(dev); return dev; free_all: free_netdev(dev); return NULL; free_pcpu: free_percpu(dev->pcpu_refcnt); free_dev: netdev_freemem(dev); return NULL; } EXPORT_SYMBOL(alloc_netdev_mqs); /** * free_netdev - free network device * @dev: device * * This function does the last stage of destroying an allocated device * interface. The reference to the device object is released. If this * is the last reference then it will be freed.Must be called in process * context. */ void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; might_sleep(); netif_free_tx_queues(dev); netif_free_rx_queues(dev); kfree(rcu_dereference_protected(dev->ingress_queue, 1)); /* Flush device addresses */ dev_addr_flush(dev); list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { netdev_freemem(dev); return; } BUG_ON(dev->reg_state != NETREG_UNREGISTERED); dev->reg_state = NETREG_RELEASED; /* will free via device release */ put_device(&dev->dev); } EXPORT_SYMBOL(free_netdev); /** * synchronize_net - Synchronize with packet receive processing * * Wait for packets currently being received to be done. * Does not block later packets from starting. */ void synchronize_net(void) { might_sleep(); if (rtnl_is_locked()) synchronize_rcu_expedited(); else synchronize_rcu(); } EXPORT_SYMBOL(synchronize_net); /** * unregister_netdevice_queue - remove device from the kernel * @dev: device * @head: list * * This function shuts down a device interface and removes it * from the kernel tables. * If head not NULL, device is queued to be unregistered later. * * Callers must hold the rtnl semaphore. You may want * unregister_netdev() instead of this. */ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) { ASSERT_RTNL(); if (head) { list_move_tail(&dev->unreg_list, head); } else { rollback_registered(dev); /* Finish processing unregister after unlock */ net_set_todo(dev); } } EXPORT_SYMBOL(unregister_netdevice_queue); /** * unregister_netdevice_many - unregister many devices * @head: list of devices * * Note: As most callers use a stack allocated list_head, * we force a list_del() to make sure stack wont be corrupted later. */ void unregister_netdevice_many(struct list_head *head) { struct net_device *dev; if (!list_empty(head)) { rollback_registered_many(head); list_for_each_entry(dev, head, unreg_list) net_set_todo(dev); list_del(head); } } EXPORT_SYMBOL(unregister_netdevice_many); /** * unregister_netdev - remove device from the kernel * @dev: device * * This function shuts down a device interface and removes it * from the kernel tables. * * This is just a wrapper for unregister_netdevice that takes * the rtnl semaphore. In general you want to use this and not * unregister_netdevice. */ void unregister_netdev(struct net_device *dev) { rtnl_lock(); unregister_netdevice(dev); rtnl_unlock(); } EXPORT_SYMBOL(unregister_netdev); /** * dev_change_net_namespace - move device to different nethost namespace * @dev: device * @net: network namespace * @pat: If not NULL name pattern to try if the current device name * is already taken in the destination network namespace. * * This function shuts down a device interface and moves it * to a new network namespace. On success 0 is returned, on * a failure a netagive errno code is returned. * * Callers must hold the rtnl semaphore. */ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) { int err, new_nsid, new_ifindex; ASSERT_RTNL(); /* Don't allow namespace local devices to be moved. */ err = -EINVAL; if (dev->features & NETIF_F_NETNS_LOCAL) goto out; /* Ensure the device has been registrered */ if (dev->reg_state != NETREG_REGISTERED) goto out; /* Get out if there is nothing todo */ err = 0; if (net_eq(dev_net(dev), net)) goto out; /* Pick the destination device name, and ensure * we can use it in the destination network namespace. */ err = -EEXIST; if (__dev_get_by_name(net, dev->name)) { /* We get here if we can't use the current device name */ if (!pat) goto out; err = dev_get_valid_name(net, dev, pat); if (err < 0) goto out; } /* * And now a mini version of register_netdevice unregister_netdevice. */ /* If device is running close it first. */ dev_close(dev); /* And unlink it from device chain */ unlist_netdevice(dev); synchronize_net(); /* Shutdown queueing discipline. */ dev_shutdown(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. * * Note that dev->reg_state stays at NETREG_REGISTERED. * This is wanted because this way 8021q and macvlan know * the device is just moving and can keep their slaves up. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); /* If there is an ifindex conflict assign a new one */ if (__dev_get_by_index(net, dev->ifindex)) new_ifindex = dev_new_index(net); else new_ifindex = dev->ifindex; rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, new_ifindex); /* * Flush the unicast and multicast chains */ dev_uc_flush(dev); dev_mc_flush(dev); /* Send a netdev-removed uevent to the old namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); netdev_adjacent_del_links(dev); /* Actually switch the network namespace */ dev_net_set(dev, net); dev->ifindex = new_ifindex; /* Send a netdev-add uevent to the new namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); netdev_adjacent_add_links(dev); /* Fixup kobjects */ err = device_rename(&dev->dev, dev->name); WARN_ON(err); /* Add the device back in the hashes */ list_netdevice(dev); /* Notify protocols, that a new device appeared. */ call_netdevice_notifiers(NETDEV_REGISTER, dev); /* * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); synchronize_net(); err = 0; out: return err; } EXPORT_SYMBOL_GPL(dev_change_net_namespace); static int dev_cpu_dead(unsigned int oldcpu) { struct sk_buff **list_skb; struct sk_buff *skb; unsigned int cpu; struct softnet_data *sd, *oldsd, *remsd = NULL; local_irq_disable(); cpu = smp_processor_id(); sd = &per_cpu(softnet_data, cpu); oldsd = &per_cpu(softnet_data, oldcpu); /* Find end of our completion_queue. */ list_skb = &sd->completion_queue; while (*list_skb) list_skb = &(*list_skb)->next; /* Append completion queue from offline CPU. */ *list_skb = oldsd->completion_queue; oldsd->completion_queue = NULL; /* Append output queue from offline CPU. */ if (oldsd->output_queue) { *sd->output_queue_tailp = oldsd->output_queue; sd->output_queue_tailp = oldsd->output_queue_tailp; oldsd->output_queue = NULL; oldsd->output_queue_tailp = &oldsd->output_queue; } /* Append NAPI poll list from offline CPU, with one exception : * process_backlog() must be called by cpu owning percpu backlog. * We properly handle process_queue & input_pkt_queue later. */ while (!list_empty(&oldsd->poll_list)) { struct napi_struct *napi = list_first_entry(&oldsd->poll_list, struct napi_struct, poll_list); list_del_init(&napi->poll_list); if (napi->poll == process_backlog) napi->state = 0; else ____napi_schedule(sd, napi); } raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); #ifdef CONFIG_RPS remsd = oldsd->rps_ipi_list; oldsd->rps_ipi_list = NULL; #endif /* send out pending IPI's on offline CPU */ net_rps_send_ipi(remsd); /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { netif_rx_ni(skb); input_queue_head_incr(oldsd); } while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { netif_rx_ni(skb); input_queue_head_incr(oldsd); } return 0; } /** * netdev_increment_features - increment feature set by one * @all: current feature set * @one: new feature set * @mask: mask feature set * * Computes a new feature set after adding a device with feature set * @one to the master device with current feature set @all. Will not * enable anything that is off in @mask. Returns the new feature set. */ netdev_features_t netdev_increment_features(netdev_features_t all, netdev_features_t one, netdev_features_t mask) { if (mask & NETIF_F_HW_CSUM) mask |= NETIF_F_CSUM_MASK; mask |= NETIF_F_VLAN_CHALLENGED; all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask; all &= one | ~NETIF_F_ALL_FOR_ALL; /* If one device supports hw checksumming, set for all. */ if (all & NETIF_F_HW_CSUM) all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM); return all; } EXPORT_SYMBOL(netdev_increment_features); static struct hlist_head * __net_init netdev_create_hash(void) { int i; struct hlist_head *hash; hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL); if (hash != NULL) for (i = 0; i < NETDEV_HASHENTRIES; i++) INIT_HLIST_HEAD(&hash[i]); return hash; } /* Initialize per network namespace state */ static int __net_init netdev_init(struct net *net) { BUILD_BUG_ON(GRO_HASH_BUCKETS > 8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask)); if (net != &init_net) INIT_LIST_HEAD(&net->dev_base_head); net->dev_name_head = netdev_create_hash(); if (net->dev_name_head == NULL) goto err_name; net->dev_index_head = netdev_create_hash(); if (net->dev_index_head == NULL) goto err_idx; return 0; err_idx: kfree(net->dev_name_head); err_name: return -ENOMEM; } /** * netdev_drivername - network driver for the device * @dev: network device * * Determine network driver for device. */ const char *netdev_drivername(const struct net_device *dev) { const struct device_driver *driver; const struct device *parent; const char *empty = ""; parent = dev->dev.parent; if (!parent) return empty; driver = parent->driver; if (driver && driver->name) return driver->name; return empty; } static void __netdev_printk(const char *level, const struct net_device *dev, struct va_format *vaf) { if (dev && dev->dev.parent) { dev_printk_emit(level[1] - '0', dev->dev.parent, "%s %s %s%s: %pV", dev_driver_string(dev->dev.parent), dev_name(dev->dev.parent), netdev_name(dev), netdev_reg_state(dev), vaf); } else if (dev) { printk("%s%s%s: %pV", level, netdev_name(dev), netdev_reg_state(dev), vaf); } else { printk("%s(NULL net_device): %pV", level, vaf); } } void netdev_printk(const char *level, const struct net_device *dev, const char *format, ...) { struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; __netdev_printk(level, dev, &vaf); va_end(args); } EXPORT_SYMBOL(netdev_printk); #define define_netdev_printk_level(func, level) \ void func(const struct net_device *dev, const char *fmt, ...) \ { \ struct va_format vaf; \ va_list args; \ \ va_start(args, fmt); \ \ vaf.fmt = fmt; \ vaf.va = &args; \ \ __netdev_printk(level, dev, &vaf); \ \ va_end(args); \ } \ EXPORT_SYMBOL(func); define_netdev_printk_level(netdev_emerg, KERN_EMERG); define_netdev_printk_level(netdev_alert, KERN_ALERT); define_netdev_printk_level(netdev_crit, KERN_CRIT); define_netdev_printk_level(netdev_err, KERN_ERR); define_netdev_printk_level(netdev_warn, KERN_WARNING); define_netdev_printk_level(netdev_notice, KERN_NOTICE); define_netdev_printk_level(netdev_info, KERN_INFO); static void __net_exit netdev_exit(struct net *net) { kfree(net->dev_name_head); kfree(net->dev_index_head); if (net != &init_net) WARN_ON_ONCE(!list_empty(&net->dev_base_head)); } static struct pernet_operations __net_initdata netdev_net_ops = { .init = netdev_init, .exit = netdev_exit, }; static void __net_exit default_device_exit(struct net *net) { struct net_device *dev, *aux; /* * Push all migratable network devices back to the * initial network namespace */ rtnl_lock(); for_each_netdev_safe(net, dev, aux) { int err; char fb_name[IFNAMSIZ]; /* Ignore unmoveable devices (i.e. loopback) */ if (dev->features & NETIF_F_NETNS_LOCAL) continue; /* Leave virtual devices for the generic cleanup */ if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund) continue; /* Push remaining network devices to init_net */ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); if (__dev_get_by_name(&init_net, fb_name)) snprintf(fb_name, IFNAMSIZ, "dev%%d"); err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { pr_emerg("%s: failed to move %s to init_net: %d\n", __func__, dev->name, err); BUG(); } } rtnl_unlock(); } static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) { /* Return with the rtnl_lock held when there are no network * devices unregistering in any network namespace in net_list. */ struct net *net; bool unregistering; DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&netdev_unregistering_wq, &wait); for (;;) { unregistering = false; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { if (net->dev_unreg_count > 0) { unregistering = true; break; } } if (!unregistering) break; __rtnl_unlock(); wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(&netdev_unregistering_wq, &wait); } static void __net_exit default_device_exit_batch(struct list_head *net_list) { /* At exit all network devices most be removed from a network * namespace. Do this in the reverse order of registration. * Do this across as many network namespaces as possible to * improve batching efficiency. */ struct net_device *dev; struct net *net; LIST_HEAD(dev_kill_list); /* To prevent network device cleanup code from dereferencing * loopback devices or network devices that have been freed * wait here for all pending unregistrations to complete, * before unregistring the loopback device and allowing the * network namespace be freed. * * The netdev todo list containing all network devices * unregistrations that happen in default_device_exit_batch * will run in the rtnl_unlock() at the end of * default_device_exit_batch. */ rtnl_lock_unregistering(net_list); list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) dev->rtnl_link_ops->dellink(dev, &dev_kill_list); else unregister_netdevice_queue(dev, &dev_kill_list); } } unregister_netdevice_many(&dev_kill_list); rtnl_unlock(); } static struct pernet_operations __net_initdata default_device_ops = { .exit = default_device_exit, .exit_batch = default_device_exit_batch, }; /* * Initialize the DEV module. At boot time this walks the device list and * unhooks any devices that fail to initialise (normally hardware not * present) and leaves us with a valid list of present and active devices. * */ /* * This is called single threaded during boot, so no need * to take the rtnl semaphore. */ static int __init net_dev_init(void) { int i, rc = -ENOMEM; BUG_ON(!dev_boot_phase); if (dev_proc_init()) goto out; if (netdev_kobject_init()) goto out; INIT_LIST_HEAD(&ptype_all); for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]); INIT_LIST_HEAD(&offload_base); if (register_pernet_subsys(&netdev_net_ops)) goto out; /* * Initialise the packet receive queues. */ for_each_possible_cpu(i) { struct work_struct *flush = per_cpu_ptr(&flush_works, i); struct softnet_data *sd = &per_cpu(softnet_data, i); INIT_WORK(flush, flush_backlog); skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); #ifdef CONFIG_XFRM_OFFLOAD skb_queue_head_init(&sd->xfrm_backlog); #endif INIT_LIST_HEAD(&sd->poll_list); sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS sd->csd.func = rps_trigger_softirq; sd->csd.info = sd; sd->cpu = i; #endif init_gro_hash(&sd->backlog); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; } dev_boot_phase = 0; /* The loopback device is special if any other network devices * is present in a network namespace the loopback device must * be present. Since we now dynamically allocate and free the * loopback device ensure this invariant is maintained by * keeping the loopback device as the first device on the * list of network devices. Ensuring the loopback devices * is the first device that appears and the last network device * that disappears. */ if (register_pernet_device(&loopback_net_ops)) goto out; if (register_pernet_device(&default_device_ops)) goto out; open_softirq(NET_TX_SOFTIRQ, net_tx_action); open_softirq(NET_RX_SOFTIRQ, net_rx_action); rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead", NULL, dev_cpu_dead); WARN_ON(rc < 0); rc = 0; out: return rc; } subsys_initcall(net_dev_init); |
854 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 | /* SPDX-License-Identifier: GPL-2.0 */ /* linux/include/linux/clockchips.h * * This file contains the structure definitions for clockchips. * * If you are not a clockchip, or the time of day code, you should * not be including this file! */ #ifndef _LINUX_CLOCKCHIPS_H #define _LINUX_CLOCKCHIPS_H #ifdef CONFIG_GENERIC_CLOCKEVENTS # include <linux/clocksource.h> # include <linux/cpumask.h> # include <linux/ktime.h> # include <linux/notifier.h> struct clock_event_device; struct module; /* * Possible states of a clock event device. * * DETACHED: Device is not used by clockevents core. Initial state or can be * reached from SHUTDOWN. * SHUTDOWN: Device is powered-off. Can be reached from PERIODIC or ONESHOT. * PERIODIC: Device is programmed to generate events periodically. Can be * reached from DETACHED or SHUTDOWN. * ONESHOT: Device is programmed to generate event only once. Can be reached * from DETACHED or SHUTDOWN. * ONESHOT_STOPPED: Device was programmed in ONESHOT mode and is temporarily * stopped. */ enum clock_event_state { CLOCK_EVT_STATE_DETACHED, CLOCK_EVT_STATE_SHUTDOWN, CLOCK_EVT_STATE_PERIODIC, CLOCK_EVT_STATE_ONESHOT, CLOCK_EVT_STATE_ONESHOT_STOPPED, }; /* * Clock event features */ # define CLOCK_EVT_FEAT_PERIODIC 0x000001 # define CLOCK_EVT_FEAT_ONESHOT 0x000002 # define CLOCK_EVT_FEAT_KTIME 0x000004 /* * x86(64) specific (mis)features: * * - Clockevent source stops in C3 State and needs broadcast support. * - Local APIC timer is used as a dummy device. */ # define CLOCK_EVT_FEAT_C3STOP 0x000008 # define CLOCK_EVT_FEAT_DUMMY 0x000010 /* * Core shall set the interrupt affinity dynamically in broadcast mode */ # define CLOCK_EVT_FEAT_DYNIRQ 0x000020 # define CLOCK_EVT_FEAT_PERCPU 0x000040 /* * Clockevent device is based on a hrtimer for broadcast */ # define CLOCK_EVT_FEAT_HRTIMER 0x000080 /** * struct clock_event_device - clock event device descriptor * @event_handler: Assigned by the framework to be called by the low * level handler of the event source * @set_next_event: set next event function using a clocksource delta * @set_next_ktime: set next event function using a direct ktime value * @next_event: local storage for the next event in oneshot mode * @max_delta_ns: maximum delta value in ns * @min_delta_ns: minimum delta value in ns * @mult: nanosecond to cycles multiplier * @shift: nanoseconds to cycles divisor (power of two) * @state_use_accessors:current state of the device, assigned by the core code * @features: features * @retries: number of forced programming retries * @set_state_periodic: switch state to periodic * @set_state_oneshot: switch state to oneshot * @set_state_oneshot_stopped: switch state to oneshot_stopped * @set_state_shutdown: switch state to shutdown * @tick_resume: resume clkevt device * @broadcast: function to broadcast events * @min_delta_ticks: minimum delta value in ticks stored for reconfiguration * @max_delta_ticks: maximum delta value in ticks stored for reconfiguration * @name: ptr to clock event name * @rating: variable to rate clock event devices * @irq: IRQ number (only for non CPU local devices) * @bound_on: Bound on CPU * @cpumask: cpumask to indicate for which CPUs this device works * @list: list head for the management code * @owner: module reference */ struct clock_event_device { void (*event_handler)(struct clock_event_device *); int (*set_next_event)(unsigned long evt, struct clock_event_device *); int (*set_next_ktime)(ktime_t expires, struct clock_event_device *); ktime_t next_event; u64 max_delta_ns; u64 min_delta_ns; u32 mult; u32 shift; enum clock_event_state state_use_accessors; unsigned int features; unsigned long retries; int (*set_state_periodic)(struct clock_event_device *); int (*set_state_oneshot)(struct clock_event_device *); int (*set_state_oneshot_stopped)(struct clock_event_device *); int (*set_state_shutdown)(struct clock_event_device *); int (*tick_resume)(struct clock_event_device *); void (*broadcast)(const struct cpumask *mask); void (*suspend)(struct clock_event_device *); void (*resume)(struct clock_event_device *); unsigned long min_delta_ticks; unsigned long max_delta_ticks; const char *name; int rating; int irq; int bound_on; const struct cpumask *cpumask; struct list_head list; struct module *owner; } ____cacheline_aligned; /* Helpers to verify state of a clockevent device */ static inline bool clockevent_state_detached(struct clock_event_device *dev) { return dev->state_use_accessors == CLOCK_EVT_STATE_DETACHED; } static inline bool clockevent_state_shutdown(struct clock_event_device *dev) { return dev->state_use_accessors == CLOCK_EVT_STATE_SHUTDOWN; } static inline bool clockevent_state_periodic(struct clock_event_device *dev) { return dev->state_use_accessors == CLOCK_EVT_STATE_PERIODIC; } static inline bool clockevent_state_oneshot(struct clock_event_device *dev) { return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT; } static inline bool clockevent_state_oneshot_stopped(struct clock_event_device *dev) { return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT_STOPPED; } /* * Calculate a multiplication factor for scaled math, which is used to convert * nanoseconds based values to clock ticks: * * clock_ticks = (nanoseconds * factor) >> shift. * * div_sc is the rearranged equation to calculate a factor from a given clock * ticks / nanoseconds ratio: * * factor = (clock_ticks << shift) / nanoseconds */ static inline unsigned long div_sc(unsigned long ticks, unsigned long nsec, int shift) { u64 tmp = ((u64)ticks) << shift; do_div(tmp, nsec); return (unsigned long) tmp; } /* Clock event layer functions */ extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt); extern void clockevents_register_device(struct clock_event_device *dev); extern int clockevents_unbind_device(struct clock_event_device *ced, int cpu); extern void clockevents_config_and_register(struct clock_event_device *dev, u32 freq, unsigned long min_delta, unsigned long max_delta); extern int clockevents_update_freq(struct clock_event_device *ce, u32 freq); static inline void clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 maxsec) { return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, freq, maxsec); } extern void clockevents_suspend(void); extern void clockevents_resume(void); # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST # ifdef CONFIG_ARCH_HAS_TICK_BROADCAST extern void tick_broadcast(const struct cpumask *mask); # else # define tick_broadcast NULL # endif extern int tick_receive_broadcast(void); # endif # if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) extern void tick_setup_hrtimer_broadcast(void); extern int tick_check_broadcast_expired(void); # else static inline int tick_check_broadcast_expired(void) { return 0; } static inline void tick_setup_hrtimer_broadcast(void) { } # endif #else /* !CONFIG_GENERIC_CLOCKEVENTS: */ static inline void clockevents_suspend(void) { } static inline void clockevents_resume(void) { } static inline int tick_check_broadcast_expired(void) { return 0; } static inline void tick_setup_hrtimer_broadcast(void) { } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ #endif /* _LINUX_CLOCKCHIPS_H */ |
38 38 38 38 38 36 50 50 49 49 48 48 47 45 45 45 38 2 1 38 38 38 38 38 38 43 104 4 104 3 1 102 19 101 9 100 101 2 99 1 100 93 9 100 100 100 100 100 100 100 100 9 9 79 3 77 102 19 19 4 1 4 1 4 1 4 2 4 1 21 21 7 21 3 21 1 21 1 21 1 45 45 97 91 91 90 90 88 82 42 84 97 5 1 20 20 20 20 21 22 561 97 2 88 88 86 85 81 84 1 82 3 4 4 1 4 4 2 2 2 3 25 218 22 19 17 17 21 1 183 183 182 182 2 179 179 179 93 22 22 23 23 21 21 21 50 50 43 66 64 64 3 5 4 4 4 25 25 24 21 21 21 19 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/ioctl.c * * Copyright (C) 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) */ #include <linux/fs.h> #include <linux/capability.h> #include <linux/time.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/file.h> #include <linux/quotaops.h> #include <linux/random.h> #include <linux/uuid.h> #include <linux/uaccess.h> #include <linux/delay.h> #include <linux/iversion.h> #include "ext4_jbd2.h" #include "ext4.h" #include <linux/fsmap.h> #include "fsmap.h" #include <trace/events/ext4.h> /** * Swap memory between @a and @b for @len bytes. * * @a: pointer to first memory area * @b: pointer to second memory area * @len: number of bytes to swap * */ static void memswap(void *a, void *b, size_t len) { unsigned char *ap, *bp; ap = (unsigned char *)a; bp = (unsigned char *)b; while (len-- > 0) { swap(*ap, *bp); ap++; bp++; } } /** * Swap i_data and associated attributes between @inode1 and @inode2. * This function is used for the primary swap between inode1 and inode2 * and also to revert this primary swap in case of errors. * * Therefore you have to make sure, that calling this method twice * will revert all changes. * * @inode1: pointer to first inode * @inode2: pointer to second inode */ static void swap_inode_data(struct inode *inode1, struct inode *inode2) { loff_t isize; struct ext4_inode_info *ei1; struct ext4_inode_info *ei2; unsigned long tmp; ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); swap(inode1->i_version, inode2->i_version); swap(inode1->i_atime, inode2->i_atime); swap(inode1->i_mtime, inode2->i_mtime); memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP; ei1->i_flags = (ei2->i_flags & EXT4_FL_SHOULD_SWAP) | (ei1->i_flags & ~EXT4_FL_SHOULD_SWAP); ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP); swap(ei1->i_disksize, ei2->i_disksize); ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); isize = i_size_read(inode1); i_size_write(inode1, i_size_read(inode2)); i_size_write(inode2, isize); } static void reset_inode_seed(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); __u32 csum; if (!ext4_has_metadata_csum(inode->i_sb)) return; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); } /** * Swap the information from the given @inode and the inode * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other * important fields of the inodes. * * @sb: the super block of the filesystem * @inode: the inode to swap with EXT4_BOOT_LOADER_INO * */ static long swap_inode_boot_loader(struct super_block *sb, struct inode *inode) { handle_t *handle; int err; struct inode *inode_bl; struct ext4_inode_info *ei_bl; qsize_t size, size_bl, diff; blkcnt_t blocks; unsigned short bytes; inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL); if (IS_ERR(inode_bl)) return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); /* Protect orig inodes against a truncate and make sure, * that only 1 swap_inode_boot_loader is running. */ lock_two_nondirectories(inode, inode_bl); if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) || ext4_has_inline_data(inode)) { err = -EINVAL; goto journal_err_out; } if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { err = -EPERM; goto journal_err_out; } down_write(&EXT4_I(inode)->i_mmap_sem); err = filemap_write_and_wait(inode->i_mapping); if (err) goto err_out; err = filemap_write_and_wait(inode_bl->i_mapping); if (err) goto err_out; /* Wait for all existing dio workers */ inode_dio_wait(inode); inode_dio_wait(inode_bl); truncate_inode_pages(&inode->i_data, 0); truncate_inode_pages(&inode_bl->i_data, 0); handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); if (IS_ERR(handle)) { err = -EINVAL; goto err_out; } /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(inode, inode_bl); if (inode_bl->i_nlink == 0) { /* this inode has never been used as a BOOT_LOADER */ set_nlink(inode_bl, 1); i_uid_write(inode_bl, 0); i_gid_write(inode_bl, 0); inode_bl->i_flags = 0; ei_bl->i_flags = 0; inode_set_iversion(inode_bl, 1); i_size_write(inode_bl, 0); inode_bl->i_mode = S_IFREG; if (ext4_has_feature_extents(sb)) { ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS); ext4_ext_tree_init(handle, inode_bl); } else memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); } err = dquot_initialize(inode); if (err) goto err_out1; size = (qsize_t)(inode->i_blocks) * (1 << 9) + inode->i_bytes; size_bl = (qsize_t)(inode_bl->i_blocks) * (1 << 9) + inode_bl->i_bytes; diff = size - size_bl; swap_inode_data(inode, inode_bl); inode->i_ctime = inode_bl->i_ctime = current_time(inode); inode->i_generation = prandom_u32(); inode_bl->i_generation = prandom_u32(); reset_inode_seed(inode); reset_inode_seed(inode_bl); ext4_discard_preallocations(inode); err = ext4_mark_inode_dirty(handle, inode); if (err < 0) { /* No need to update quota information. */ ext4_warning(inode->i_sb, "couldn't mark inode #%lu dirty (err %d)", inode->i_ino, err); /* Revert all changes: */ swap_inode_data(inode, inode_bl); ext4_mark_inode_dirty(handle, inode); goto err_out1; } blocks = inode_bl->i_blocks; bytes = inode_bl->i_bytes; inode_bl->i_blocks = inode->i_blocks; inode_bl->i_bytes = inode->i_bytes; err = ext4_mark_inode_dirty(handle, inode_bl); if (err < 0) { /* No need to update quota information. */ ext4_warning(inode_bl->i_sb, "couldn't mark inode #%lu dirty (err %d)", inode_bl->i_ino, err); goto revert; } /* Bootloader inode should not be counted into quota information. */ if (diff > 0) dquot_free_space(inode, diff); else err = dquot_alloc_space(inode, -1 * diff); if (err < 0) { revert: /* Revert all changes: */ inode_bl->i_blocks = blocks; inode_bl->i_bytes = bytes; swap_inode_data(inode, inode_bl); ext4_mark_inode_dirty(handle, inode); ext4_mark_inode_dirty(handle, inode_bl); } err_out1: ext4_journal_stop(handle); ext4_double_up_write_data_sem(inode, inode_bl); err_out: up_write(&EXT4_I(inode)->i_mmap_sem); journal_err_out: unlock_two_nondirectories(inode, inode_bl); iput(inode_bl); return err; } #ifdef CONFIG_EXT4_FS_ENCRYPTION static int uuid_is_zero(__u8 u[16]) { int i; for (i = 0; i < 16; i++) if (u[i]) return 0; return 1; } #endif /* * If immutable is set and we are not clearing it, we're not allowed to change * anything else in the inode. Don't error out if we're only trying to set * immutable on an immutable file. */ static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid, unsigned int flags) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int oldflags = ei->i_flags; if (!(oldflags & EXT4_IMMUTABLE_FL) || !(flags & EXT4_IMMUTABLE_FL)) return 0; if ((oldflags & ~EXT4_IMMUTABLE_FL) != (flags & ~EXT4_IMMUTABLE_FL)) return -EPERM; if (ext4_has_feature_project(inode->i_sb) && __kprojid_val(ei->i_projid) != new_projid) return -EPERM; return 0; } static int ext4_ioctl_setflags(struct inode *inode, unsigned int flags) { struct ext4_inode_info *ei = EXT4_I(inode); handle_t *handle = NULL; int err = -EPERM, migrate = 0; struct ext4_iloc iloc; unsigned int oldflags, mask, i; unsigned int jflag; /* Is it quota file? Do not allow user to mess with it */ if (ext4_is_quota_file(inode)) goto flags_out; oldflags = ei->i_flags; /* The JOURNAL_DATA flag is modifiable only by root */ jflag = flags & EXT4_JOURNAL_DATA_FL; /* * The IMMUTABLE and APPEND_ONLY flags can only be changed by * the relevant capability. * * This test looks nicer. Thanks to Pauline Middelink */ if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) goto flags_out; } /* * The JOURNAL_DATA flag can only be changed by * the relevant capability. */ if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { if (!capable(CAP_SYS_RESOURCE)) goto flags_out; } if ((flags ^ oldflags) & EXT4_EXTENTS_FL) migrate = 1; if (flags & EXT4_EOFBLOCKS_FL) { /* we don't support adding EOFBLOCKS flag */ if (!(oldflags & EXT4_EOFBLOCKS_FL)) { err = -EOPNOTSUPP; goto flags_out; } } else if (oldflags & EXT4_EOFBLOCKS_FL) { err = ext4_truncate(inode); if (err) goto flags_out; } /* * Wait for all pending directio and then flush all the dirty pages * for this file. The flush marks all the pages readonly, so any * subsequent attempt to write to the file (particularly mmap pages) * will come through the filesystem and fail. */ if (S_ISREG(inode->i_mode) && !IS_IMMUTABLE(inode) && (flags & EXT4_IMMUTABLE_FL)) { inode_dio_wait(inode); err = filemap_write_and_wait(inode->i_mapping); if (err) goto flags_out; } handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto flags_out; } if (IS_SYNC(inode)) ext4_handle_sync(handle); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto flags_err; for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { if (!(mask & EXT4_FL_USER_MODIFIABLE)) continue; /* These flags get special treatment later */ if (mask == EXT4_JOURNAL_DATA_FL || mask == EXT4_EXTENTS_FL) continue; if (mask & flags) ext4_set_inode_flag(inode, i); else ext4_clear_inode_flag(inode, i); } ext4_set_inode_flags(inode); inode->i_ctime = current_time(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); flags_err: ext4_journal_stop(handle); if (err) goto flags_out; if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { /* * Changes to the journaling mode can cause unsafe changes to * S_DAX if we are using the DAX mount option. */ if (test_opt(inode->i_sb, DAX)) { err = -EBUSY; goto flags_out; } err = ext4_change_inode_journal_flag(inode, jflag); if (err) goto flags_out; } if (migrate) { if (flags & EXT4_EXTENTS_FL) err = ext4_ext_migrate(inode); else err = ext4_ind_migrate(inode); } flags_out: return err; } #ifdef CONFIG_QUOTA static int ext4_ioctl_setproject(struct file *filp, __u32 projid) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); int err, rc; handle_t *handle; kprojid_t kprojid; struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct dquot *transfer_to[MAXQUOTAS] = { }; if (!ext4_has_feature_project(sb)) { if (projid != EXT4_DEF_PROJID) return -EOPNOTSUPP; else return 0; } if (EXT4_INODE_SIZE(sb) <= EXT4_GOOD_OLD_INODE_SIZE) return -EOPNOTSUPP; kprojid = make_kprojid(&init_user_ns, (projid_t)projid); if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) return 0; err = -EPERM; /* Is it quota file? Do not allow user to mess with it */ if (ext4_is_quota_file(inode)) return err; err = ext4_get_inode_loc(inode, &iloc); if (err) return err; raw_inode = ext4_raw_inode(&iloc); if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { err = ext4_expand_extra_isize(inode, EXT4_SB(sb)->s_want_extra_isize, &iloc); if (err) return err; } else { brelse(iloc.bh); } err = dquot_initialize(inode); if (err) return err; handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(sb) + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); if (IS_ERR(handle)) return PTR_ERR(handle); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out_stop; transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); if (!IS_ERR(transfer_to[PRJQUOTA])) { /* __dquot_transfer() calls back ext4_get_inode_usage() which * counts xattr inode references. */ down_read(&EXT4_I(inode)->xattr_sem); err = __dquot_transfer(inode, transfer_to); up_read(&EXT4_I(inode)->xattr_sem); dqput(transfer_to[PRJQUOTA]); if (err) goto out_dirty; } EXT4_I(inode)->i_projid = kprojid; inode->i_ctime = current_time(inode); out_dirty: rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; out_stop: ext4_journal_stop(handle); return err; } #else static int ext4_ioctl_setproject(struct file *filp, __u32 projid) { if (projid != EXT4_DEF_PROJID) return -EOPNOTSUPP; return 0; } #endif /* Transfer internal flags to xflags */ static inline __u32 ext4_iflags_to_xflags(unsigned long iflags) { __u32 xflags = 0; if (iflags & EXT4_SYNC_FL) xflags |= FS_XFLAG_SYNC; if (iflags & EXT4_IMMUTABLE_FL) xflags |= FS_XFLAG_IMMUTABLE; if (iflags & EXT4_APPEND_FL) xflags |= FS_XFLAG_APPEND; if (iflags & EXT4_NODUMP_FL) xflags |= FS_XFLAG_NODUMP; if (iflags & EXT4_NOATIME_FL) xflags |= FS_XFLAG_NOATIME; if (iflags & EXT4_PROJINHERIT_FL) xflags |= FS_XFLAG_PROJINHERIT; return xflags; } #define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \ FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \ FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT) /* Transfer xflags flags to internal */ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) { unsigned long iflags = 0; if (xflags & FS_XFLAG_SYNC) iflags |= EXT4_SYNC_FL; if (xflags & FS_XFLAG_IMMUTABLE) iflags |= EXT4_IMMUTABLE_FL; if (xflags & FS_XFLAG_APPEND) iflags |= EXT4_APPEND_FL; if (xflags & FS_XFLAG_NODUMP) iflags |= EXT4_NODUMP_FL; if (xflags & FS_XFLAG_NOATIME) iflags |= EXT4_NOATIME_FL; if (xflags & FS_XFLAG_PROJINHERIT) iflags |= EXT4_PROJINHERIT_FL; return iflags; } static int ext4_shutdown(struct super_block *sb, unsigned long arg) { struct ext4_sb_info *sbi = EXT4_SB(sb); __u32 flags; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(flags, (__u32 __user *)arg)) return -EFAULT; if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH) return -EINVAL; if (ext4_forced_shutdown(sbi)) return 0; ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags); trace_ext4_shutdown(sb, flags); switch (flags) { case EXT4_GOING_FLAGS_DEFAULT: freeze_bdev(sb->s_bdev); set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); thaw_bdev(sb->s_bdev, sb); break; case EXT4_GOING_FLAGS_LOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) { (void) ext4_force_commit(sb); jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN); } break; case EXT4_GOING_FLAGS_NOLOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN); break; default: return -EINVAL; } clear_opt(sb, DISCARD); return 0; } struct getfsmap_info { struct super_block *gi_sb; struct fsmap_head __user *gi_data; unsigned int gi_idx; __u32 gi_last_flags; }; static int ext4_getfsmap_format(struct ext4_fsmap *xfm, void *priv) { struct getfsmap_info *info = priv; struct fsmap fm; trace_ext4_getfsmap_mapping(info->gi_sb, xfm); info->gi_last_flags = xfm->fmr_flags; ext4_fsmap_from_internal(info->gi_sb, &fm, xfm); if (copy_to_user(&info->gi_data->fmh_recs[info->gi_idx++], &fm, sizeof(struct fsmap))) return -EFAULT; return 0; } static int ext4_ioc_getfsmap(struct super_block *sb, struct fsmap_head __user *arg) { struct getfsmap_info info = {0}; struct ext4_fsmap_head xhead = {0}; struct fsmap_head head; bool aborted = false; int error; if (copy_from_user(&head, arg, sizeof(struct fsmap_head))) return -EFAULT; if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) || memchr_inv(head.fmh_keys[0].fmr_reserved, 0, sizeof(head.fmh_keys[0].fmr_reserved)) || memchr_inv(head.fmh_keys[1].fmr_reserved, 0, sizeof(head.fmh_keys[1].fmr_reserved))) return -EINVAL; /* * ext4 doesn't report file extents at all, so the only valid * file offsets are the magic ones (all zeroes or all ones). */ if (head.fmh_keys[0].fmr_offset || (head.fmh_keys[1].fmr_offset != 0 && head.fmh_keys[1].fmr_offset != -1ULL)) return -EINVAL; xhead.fmh_iflags = head.fmh_iflags; xhead.fmh_count = head.fmh_count; ext4_fsmap_to_internal(sb, &xhead.fmh_keys[0], &head.fmh_keys[0]); ext4_fsmap_to_internal(sb, &xhead.fmh_keys[1], &head.fmh_keys[1]); trace_ext4_getfsmap_low_key(sb, &xhead.fmh_keys[0]); trace_ext4_getfsmap_high_key(sb, &xhead.fmh_keys[1]); info.gi_sb = sb; info.gi_data = arg; error = ext4_getfsmap(sb, &xhead, ext4_getfsmap_format, &info); if (error == EXT4_QUERY_RANGE_ABORT) { error = 0; aborted = true; } else if (error) return error; /* If we didn't abort, set the "last" flag in the last fmx */ if (!aborted && info.gi_idx) { info.gi_last_flags |= FMR_OF_LAST; if (copy_to_user(&info.gi_data->fmh_recs[info.gi_idx - 1].fmr_flags, &info.gi_last_flags, sizeof(info.gi_last_flags))) return -EFAULT; } /* copy back header */ head.fmh_entries = xhead.fmh_entries; head.fmh_oflags = xhead.fmh_oflags; if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) return -EFAULT; return 0; } static long ext4_ioctl_group_add(struct file *file, struct ext4_new_group_data *input) { struct super_block *sb = file_inode(file)->i_sb; int err, err2=0; err = ext4_resize_begin(sb); if (err) return err; if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); err = -EOPNOTSUPP; goto group_add_out; } err = mnt_want_write_file(file); if (err) goto group_add_out; err = ext4_group_add(sb, input); if (EXT4_SB(sb)->s_journal) { jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } if (err == 0) err = err2; mnt_drop_write_file(file); if (!err && ext4_has_group_desc_csum(sb) && test_opt(sb, INIT_INODE_TABLE)) err = ext4_register_li_request(sb, input->group); group_add_out: ext4_resize_end(sb); return err; } static int ext4_ioctl_check_project(struct inode *inode, struct fsxattr *fa) { /* * Project Quota ID state is only allowed to change from within the init * namespace. Enforce that restriction only if we are trying to change * the quota ID state. Everything else is allowed in user namespaces. */ if (current_user_ns() == &init_user_ns) return 0; if (__kprojid_val(EXT4_I(inode)->i_projid) != fa->fsx_projid) return -EINVAL; if (ext4_test_inode_flag(inode, EXT4_INODE_PROJINHERIT)) { if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT)) return -EINVAL; } else { if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) return -EINVAL; } return 0; } long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); switch (cmd) { case FS_IOC_GETFSMAP: return ext4_ioc_getfsmap(sb, (void __user *)arg); case EXT4_IOC_GETFLAGS: flags = ei->i_flags & EXT4_FL_USER_VISIBLE; return put_user(flags, (int __user *) arg); case EXT4_IOC_SETFLAGS: { int err; if (!inode_owner_or_capable(inode)) return -EACCES; if (get_user(flags, (int __user *) arg)) return -EFAULT; if (flags & ~EXT4_FL_USER_VISIBLE) return -EOPNOTSUPP; /* * chattr(1) grabs flags via GETFLAGS, modifies the result and * passes that to SETFLAGS. So we cannot easily make SETFLAGS * more restrictive than just silently masking off visible but * not settable flags as we always did. */ flags &= EXT4_FL_USER_MODIFIABLE; if (ext4_mask_flags(inode->i_mode, flags) != flags) return -EOPNOTSUPP; err = mnt_want_write_file(filp); if (err) return err; inode_lock(inode); err = ext4_ioctl_check_immutable(inode, from_kprojid(&init_user_ns, ei->i_projid), flags); if (!err) err = ext4_ioctl_setflags(inode, flags); inode_unlock(inode); mnt_drop_write_file(filp); return err; } case EXT4_IOC_GETVERSION: case EXT4_IOC_GETVERSION_OLD: return put_user(inode->i_generation, (int __user *) arg); case EXT4_IOC_SETVERSION: case EXT4_IOC_SETVERSION_OLD: { handle_t *handle; struct ext4_iloc iloc; __u32 generation; int err; if (!inode_owner_or_capable(inode)) return -EPERM; if (ext4_has_metadata_csum(inode->i_sb)) { ext4_warning(sb, "Setting inode version is not " "supported with metadata_csum enabled."); return -ENOTTY; } err = mnt_want_write_file(filp); if (err) return err; if (get_user(generation, (int __user *) arg)) { err = -EFAULT; goto setversion_out; } inode_lock(inode); handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto unlock_out; } err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { inode->i_ctime = current_time(inode); inode->i_generation = generation; err = ext4_mark_iloc_dirty(handle, inode, &iloc); } ext4_journal_stop(handle); unlock_out: inode_unlock(inode); setversion_out: mnt_drop_write_file(filp); return err; } case EXT4_IOC_GROUP_EXTEND: { ext4_fsblk_t n_blocks_count; int err, err2=0; err = ext4_resize_begin(sb); if (err) return err; if (get_user(n_blocks_count, (__u32 __user *)arg)) { err = -EFAULT; goto group_extend_out; } if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); err = -EOPNOTSUPP; goto group_extend_out; } err = mnt_want_write_file(filp); if (err) goto group_extend_out; err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); if (EXT4_SB(sb)->s_journal) { jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } if (err == 0) err = err2; mnt_drop_write_file(filp); group_extend_out: ext4_resize_end(sb); return err; } case EXT4_IOC_MOVE_EXT: { struct move_extent me; struct fd donor; int err; if (!(filp->f_mode & FMODE_READ) || !(filp->f_mode & FMODE_WRITE)) return -EBADF; if (copy_from_user(&me, (struct move_extent __user *)arg, sizeof(me))) return -EFAULT; me.moved_len = 0; donor = fdget(me.donor_fd); if (!donor.file) return -EBADF; if (!(donor.file->f_mode & FMODE_WRITE)) { err = -EBADF; goto mext_out; } if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online defrag not supported with bigalloc"); err = -EOPNOTSUPP; goto mext_out; } else if (IS_DAX(inode)) { ext4_msg(sb, KERN_ERR, "Online defrag not supported with DAX"); err = -EOPNOTSUPP; goto mext_out; } err = mnt_want_write_file(filp); if (err) goto mext_out; err = ext4_move_extents(filp, donor.file, me.orig_start, me.donor_start, me.len, &me.moved_len); mnt_drop_write_file(filp); if (copy_to_user((struct move_extent __user *)arg, &me, sizeof(me))) err = -EFAULT; mext_out: fdput(donor); return err; } case EXT4_IOC_GROUP_ADD: { struct ext4_new_group_data input; if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, sizeof(input))) return -EFAULT; return ext4_ioctl_group_add(filp, &input); } case EXT4_IOC_MIGRATE: { int err; if (!inode_owner_or_capable(inode)) return -EACCES; err = mnt_want_write_file(filp); if (err) return err; /* * inode_mutex prevent write and truncate on the file. * Read still goes through. We take i_data_sem in * ext4_ext_swap_inode_data before we switch the * inode format to prevent read. */ inode_lock((inode)); err = ext4_ext_migrate(inode); inode_unlock((inode)); mnt_drop_write_file(filp); return err; } case EXT4_IOC_ALLOC_DA_BLKS: { int err; if (!inode_owner_or_capable(inode)) return -EACCES; err = mnt_want_write_file(filp); if (err) return err; err = ext4_alloc_da_blocks(inode); mnt_drop_write_file(filp); return err; } case EXT4_IOC_SWAP_BOOT: { int err; if (!(filp->f_mode & FMODE_WRITE)) return -EBADF; err = mnt_want_write_file(filp); if (err) return err; err = swap_inode_boot_loader(sb, inode); mnt_drop_write_file(filp); return err; } case EXT4_IOC_RESIZE_FS: { ext4_fsblk_t n_blocks_count; int err = 0, err2 = 0; ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, sizeof(__u64))) { return -EFAULT; } err = ext4_resize_begin(sb); if (err) return err; err = mnt_want_write_file(filp); if (err) goto resizefs_out; err = ext4_resize_fs(sb, n_blocks_count); if (EXT4_SB(sb)->s_journal) { jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } if (err == 0) err = err2; mnt_drop_write_file(filp); if (!err && (o_group < EXT4_SB(sb)->s_groups_count) && ext4_has_group_desc_csum(sb) && test_opt(sb, INIT_INODE_TABLE)) err = ext4_register_li_request(sb, o_group); resizefs_out: ext4_resize_end(sb); return err; } case FITRIM: { struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!blk_queue_discard(q)) return -EOPNOTSUPP; /* * We haven't replayed the journal, so we cannot use our * block-bitmap-guided storage zapping commands. */ if (test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) return -EROFS; if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; ret = ext4_trim_fs(sb, &range); if (ret < 0) return ret; if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; return 0; } case EXT4_IOC_PRECACHE_EXTENTS: return ext4_ext_precache(inode); case EXT4_IOC_SET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); case EXT4_IOC_GET_ENCRYPTION_PWSALT: { #ifdef CONFIG_EXT4_FS_ENCRYPTION int err, err2; struct ext4_sb_info *sbi = EXT4_SB(sb); handle_t *handle; if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) { err = mnt_want_write_file(filp); if (err) return err; handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto pwsalt_err_exit; } err = ext4_journal_get_write_access(handle, sbi->s_sbh); if (err) goto pwsalt_err_journal; lock_buffer(sbi->s_sbh); generate_random_uuid(sbi->s_es->s_encrypt_pw_salt); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); pwsalt_err_journal: err2 = ext4_journal_stop(handle); if (err2 && !err) err = err2; pwsalt_err_exit: mnt_drop_write_file(filp); if (err) return err; } if (copy_to_user((void __user *) arg, sbi->s_es->s_encrypt_pw_salt, 16)) return -EFAULT; return 0; #else return -EOPNOTSUPP; #endif } case EXT4_IOC_GET_ENCRYPTION_POLICY: return fscrypt_ioctl_get_policy(filp, (void __user *)arg); case EXT4_IOC_FSGETXATTR: { struct fsxattr fa; memset(&fa, 0, sizeof(struct fsxattr)); fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE); if (ext4_has_feature_project(inode->i_sb)) { fa.fsx_projid = (__u32)from_kprojid(&init_user_ns, EXT4_I(inode)->i_projid); } if (copy_to_user((struct fsxattr __user *)arg, &fa, sizeof(fa))) return -EFAULT; return 0; } case EXT4_IOC_FSSETXATTR: { struct fsxattr fa; int err; if (copy_from_user(&fa, (struct fsxattr __user *)arg, sizeof(fa))) return -EFAULT; /* Make sure caller has proper permission */ if (!inode_owner_or_capable(inode)) return -EACCES; if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS) return -EOPNOTSUPP; flags = ext4_xflags_to_iflags(fa.fsx_xflags); if (ext4_mask_flags(inode->i_mode, flags) != flags) return -EOPNOTSUPP; err = mnt_want_write_file(filp); if (err) return err; inode_lock(inode); err = ext4_ioctl_check_project(inode, &fa); if (err) goto out; flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | (flags & EXT4_FL_XFLAG_VISIBLE); err = ext4_ioctl_check_immutable(inode, fa.fsx_projid, flags); if (err) goto out; err = ext4_ioctl_setflags(inode, flags); if (err) goto out; err = ext4_ioctl_setproject(filp, fa.fsx_projid); out: inode_unlock(inode); mnt_drop_write_file(filp); return err; } case EXT4_IOC_SHUTDOWN: return ext4_shutdown(sb, arg); default: return -ENOTTY; } } #ifdef CONFIG_COMPAT long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { /* These are just misnamed, they actually get/put from/to user an int */ switch (cmd) { case EXT4_IOC32_GETFLAGS: cmd = EXT4_IOC_GETFLAGS; break; case EXT4_IOC32_SETFLAGS: cmd = EXT4_IOC_SETFLAGS; break; case EXT4_IOC32_GETVERSION: cmd = EXT4_IOC_GETVERSION; break; case EXT4_IOC32_SETVERSION: cmd = EXT4_IOC_SETVERSION; break; case EXT4_IOC32_GROUP_EXTEND: cmd = EXT4_IOC_GROUP_EXTEND; break; case EXT4_IOC32_GETVERSION_OLD: cmd = EXT4_IOC_GETVERSION_OLD; break; case EXT4_IOC32_SETVERSION_OLD: cmd = EXT4_IOC_SETVERSION_OLD; break; case EXT4_IOC32_GETRSVSZ: cmd = EXT4_IOC_GETRSVSZ; break; case EXT4_IOC32_SETRSVSZ: cmd = EXT4_IOC_SETRSVSZ; break; case EXT4_IOC32_GROUP_ADD: { struct compat_ext4_new_group_input __user *uinput; struct ext4_new_group_data input; int err; uinput = compat_ptr(arg); err = get_user(input.group, &uinput->group); err |= get_user(input.block_bitmap, &uinput->block_bitmap); err |= get_user(input.inode_bitmap, &uinput->inode_bitmap); err |= get_user(input.inode_table, &uinput->inode_table); err |= get_user(input.blocks_count, &uinput->blocks_count); err |= get_user(input.reserved_blocks, &uinput->reserved_blocks); if (err) return -EFAULT; return ext4_ioctl_group_add(file, &input); } case EXT4_IOC_MOVE_EXT: case EXT4_IOC_RESIZE_FS: case EXT4_IOC_PRECACHE_EXTENTS: case EXT4_IOC_SET_ENCRYPTION_POLICY: case EXT4_IOC_GET_ENCRYPTION_PWSALT: case EXT4_IOC_GET_ENCRYPTION_POLICY: case EXT4_IOC_SHUTDOWN: case FS_IOC_GETFSMAP: break; default: return -ENOIOCTLCMD; } return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); } #endif |
63 63 63 63 63 63 48 63 37 45 31 31 63 63 63 63 63 63 63 63 63 3 3 3 64 63 63 2 2 64 63 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 | /* * linux/fs/9p/trans_rdma.c * * RDMA transport layer based on the trans_fd.c implementation. * * Copyright (C) 2008 by Tom Tucker <tom@opengridcomputing.com> * Copyright (C) 2006 by Russ Cox <rsc@swtch.com> * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net> * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to: * Free Software Foundation * 51 Franklin Street, Fifth Floor * Boston, MA 02111-1301 USA * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/in.h> #include <linux/module.h> #include <linux/net.h> #include <linux/ipv6.h> #include <linux/kthread.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/un.h> #include <linux/uaccess.h> #include <linux/inet.h> #include <linux/idr.h> #include <linux/file.h> #include <linux/parser.h> #include <linux/semaphore.h> #include <linux/slab.h> #include <linux/seq_file.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include <net/9p/transport.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> #define P9_PORT 5640 #define P9_RDMA_SQ_DEPTH 32 #define P9_RDMA_RQ_DEPTH 32 #define P9_RDMA_SEND_SGE 4 #define P9_RDMA_RECV_SGE 4 #define P9_RDMA_IRD 0 #define P9_RDMA_ORD 0 #define P9_RDMA_TIMEOUT 30000 /* 30 seconds */ #define P9_RDMA_MAXSIZE (1024*1024) /* 1MB */ /** * struct p9_trans_rdma - RDMA transport instance * * @state: tracks the transport state machine for connection setup and tear down * @cm_id: The RDMA CM ID * @pd: Protection Domain pointer * @qp: Queue Pair pointer * @cq: Completion Queue pointer * @timeout: Number of uSecs to wait for connection management events * @privport: Whether a privileged port may be used * @port: The port to use * @sq_depth: The depth of the Send Queue * @sq_sem: Semaphore for the SQ * @rq_depth: The depth of the Receive Queue. * @rq_sem: Semaphore for the RQ * @excess_rc : Amount of posted Receive Contexts without a pending request. * See rdma_request() * @addr: The remote peer's address * @req_lock: Protects the active request list * @cm_done: Completion event for connection management tracking */ struct p9_trans_rdma { enum { P9_RDMA_INIT, P9_RDMA_ADDR_RESOLVED, P9_RDMA_ROUTE_RESOLVED, P9_RDMA_CONNECTED, P9_RDMA_FLUSHING, P9_RDMA_CLOSING, P9_RDMA_CLOSED, } state; struct rdma_cm_id *cm_id; struct ib_pd *pd; struct ib_qp *qp; struct ib_cq *cq; long timeout; bool privport; u16 port; int sq_depth; struct semaphore sq_sem; int rq_depth; struct semaphore rq_sem; atomic_t excess_rc; struct sockaddr_in addr; spinlock_t req_lock; struct completion cm_done; }; /** * p9_rdma_context - Keeps track of in-process WR * * @busa: Bus address to unmap when the WR completes * @req: Keeps track of requests (send) * @rc: Keepts track of replies (receive) */ struct p9_rdma_req; struct p9_rdma_context { struct ib_cqe cqe; dma_addr_t busa; union { struct p9_req_t *req; struct p9_fcall rc; }; }; /** * p9_rdma_opts - Collection of mount options * @port: port of connection * @sq_depth: The requested depth of the SQ. This really doesn't need * to be any deeper than the number of threads used in the client * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth * @timeout: Time to wait in msecs for CM events */ struct p9_rdma_opts { short port; bool privport; int sq_depth; int rq_depth; long timeout; }; /* * Option Parsing (code inspired by NFS code) */ enum { /* Options that take integer arguments */ Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, /* Options that take no argument */ Opt_privport, Opt_err, }; static match_table_t tokens = { {Opt_port, "port=%u"}, {Opt_sq_depth, "sq=%u"}, {Opt_rq_depth, "rq=%u"}, {Opt_timeout, "timeout=%u"}, {Opt_privport, "privport"}, {Opt_err, NULL}, }; static int p9_rdma_show_options(struct seq_file *m, struct p9_client *clnt) { struct p9_trans_rdma *rdma = clnt->trans; if (rdma->port != P9_PORT) seq_printf(m, ",port=%u", rdma->port); if (rdma->sq_depth != P9_RDMA_SQ_DEPTH) seq_printf(m, ",sq=%u", rdma->sq_depth); if (rdma->rq_depth != P9_RDMA_RQ_DEPTH) seq_printf(m, ",rq=%u", rdma->rq_depth); if (rdma->timeout != P9_RDMA_TIMEOUT) seq_printf(m, ",timeout=%lu", rdma->timeout); if (rdma->privport) seq_puts(m, ",privport"); return 0; } /** * parse_opts - parse mount options into rdma options structure * @params: options string passed from mount * @opts: rdma transport-specific structure to parse options into * * Returns 0 upon success, -ERRNO upon failure */ static int parse_opts(char *params, struct p9_rdma_opts *opts) { char *p; substring_t args[MAX_OPT_ARGS]; int option; char *options, *tmp_options; opts->port = P9_PORT; opts->sq_depth = P9_RDMA_SQ_DEPTH; opts->rq_depth = P9_RDMA_RQ_DEPTH; opts->timeout = P9_RDMA_TIMEOUT; opts->privport = false; if (!params) return 0; tmp_options = kstrdup(params, GFP_KERNEL); if (!tmp_options) { p9_debug(P9_DEBUG_ERROR, "failed to allocate copy of option string\n"); return -ENOMEM; } options = tmp_options; while ((p = strsep(&options, ",")) != NULL) { int token; int r; if (!*p) continue; token = match_token(p, tokens, args); if ((token != Opt_err) && (token != Opt_privport)) { r = match_int(&args[0], &option); if (r < 0) { p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); continue; } } switch (token) { case Opt_port: opts->port = option; break; case Opt_sq_depth: opts->sq_depth = option; break; case Opt_rq_depth: opts->rq_depth = option; break; case Opt_timeout: opts->timeout = option; break; case Opt_privport: opts->privport = true; break; default: continue; } } /* RQ must be at least as large as the SQ */ opts->rq_depth = max(opts->rq_depth, opts->sq_depth); kfree(tmp_options); return 0; } static int p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct p9_client *c = id->context; struct p9_trans_rdma *rdma = c->trans; switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: BUG_ON(rdma->state != P9_RDMA_INIT); rdma->state = P9_RDMA_ADDR_RESOLVED; break; case RDMA_CM_EVENT_ROUTE_RESOLVED: BUG_ON(rdma->state != P9_RDMA_ADDR_RESOLVED); rdma->state = P9_RDMA_ROUTE_RESOLVED; break; case RDMA_CM_EVENT_ESTABLISHED: BUG_ON(rdma->state != P9_RDMA_ROUTE_RESOLVED); rdma->state = P9_RDMA_CONNECTED; break; case RDMA_CM_EVENT_DISCONNECTED: if (rdma) rdma->state = P9_RDMA_CLOSED; c->status = Disconnected; break; case RDMA_CM_EVENT_TIMEWAIT_EXIT: break; case RDMA_CM_EVENT_ADDR_CHANGE: case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_MULTICAST_JOIN: case RDMA_CM_EVENT_MULTICAST_ERROR: case RDMA_CM_EVENT_REJECTED: case RDMA_CM_EVENT_CONNECT_REQUEST: case RDMA_CM_EVENT_CONNECT_RESPONSE: case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_UNREACHABLE: c->status = Disconnected; rdma_disconnect(rdma->cm_id); break; default: BUG(); } complete(&rdma->cm_done); return 0; } static void recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct p9_client *client = cq->cq_context; struct p9_trans_rdma *rdma = client->trans; struct p9_rdma_context *c = container_of(wc->wr_cqe, struct p9_rdma_context, cqe); struct p9_req_t *req; int err = 0; int16_t tag; req = NULL; ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize, DMA_FROM_DEVICE); if (wc->status != IB_WC_SUCCESS) goto err_out; c->rc.size = wc->byte_len; err = p9_parse_header(&c->rc, NULL, NULL, &tag, 1); if (err) goto err_out; req = p9_tag_lookup(client, tag); if (!req) goto err_out; /* Check that we have not yet received a reply for this request. */ if (unlikely(req->rc.sdata)) { pr_err("Duplicate reply for request %d", tag); goto err_out; } req->rc.size = c->rc.size; req->rc.sdata = c->rc.sdata; p9_client_cb(client, req, REQ_STATUS_RCVD); out: up(&rdma->rq_sem); kfree(c); return; err_out: p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n", req, err, wc->status); rdma->state = P9_RDMA_FLUSHING; client->status = Disconnected; goto out; } static void send_done(struct ib_cq *cq, struct ib_wc *wc) { struct p9_client *client = cq->cq_context; struct p9_trans_rdma *rdma = client->trans; struct p9_rdma_context *c = container_of(wc->wr_cqe, struct p9_rdma_context, cqe); ib_dma_unmap_single(rdma->cm_id->device, c->busa, c->req->tc.size, DMA_TO_DEVICE); up(&rdma->sq_sem); p9_req_put(c->req); kfree(c); } static void qp_event_handler(struct ib_event *event, void *context) { p9_debug(P9_DEBUG_ERROR, "QP event %d context %p\n", event->event, context); } static void rdma_destroy_trans(struct p9_trans_rdma *rdma) { if (!rdma) return; if (rdma->qp && !IS_ERR(rdma->qp)) ib_destroy_qp(rdma->qp); if (rdma->pd && !IS_ERR(rdma->pd)) ib_dealloc_pd(rdma->pd); if (rdma->cq && !IS_ERR(rdma->cq)) ib_free_cq(rdma->cq); if (rdma->cm_id && !IS_ERR(rdma->cm_id)) rdma_destroy_id(rdma->cm_id); kfree(rdma); } static int post_recv(struct p9_client *client, struct p9_rdma_context *c) { struct p9_trans_rdma *rdma = client->trans; struct ib_recv_wr wr; struct ib_sge sge; c->busa = ib_dma_map_single(rdma->cm_id->device, c->rc.sdata, client->msize, DMA_FROM_DEVICE); if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) goto error; c->cqe.done = recv_done; sge.addr = c->busa; sge.length = client->msize; sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; wr.wr_cqe = &c->cqe; wr.sg_list = &sge; wr.num_sge = 1; return ib_post_recv(rdma->qp, &wr, NULL); error: p9_debug(P9_DEBUG_ERROR, "EIO\n"); return -EIO; } static int rdma_request(struct p9_client *client, struct p9_req_t *req) { struct p9_trans_rdma *rdma = client->trans; struct ib_send_wr wr; struct ib_sge sge; int err = 0; unsigned long flags; struct p9_rdma_context *c = NULL; struct p9_rdma_context *rpl_context = NULL; /* When an error occurs between posting the recv and the send, * there will be a receive context posted without a pending request. * Since there is no way to "un-post" it, we remember it and skip * post_recv() for the next request. * So here, * see if we are this `next request' and need to absorb an excess rc. * If yes, then drop and free our own, and do not recv_post(). **/ if (unlikely(atomic_read(&rdma->excess_rc) > 0)) { if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) { /* Got one! */ p9_fcall_fini(&req->rc); req->rc.sdata = NULL; goto dont_need_post_recv; } else { /* We raced and lost. */ atomic_inc(&rdma->excess_rc); } } /* Allocate an fcall for the reply */ rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS); if (!rpl_context) { err = -ENOMEM; goto recv_error; } rpl_context->rc.sdata = req->rc.sdata; /* * Post a receive buffer for this request. We need to ensure * there is a reply buffer available for every outstanding * request. A flushed request can result in no reply for an * outstanding request, so we must keep a count to avoid * overflowing the RQ. */ if (down_interruptible(&rdma->rq_sem)) { err = -EINTR; goto recv_error; } err = post_recv(client, rpl_context); if (err) { p9_debug(P9_DEBUG_ERROR, "POST RECV failed: %d\n", err); goto recv_error; } /* remove posted receive buffer from request structure */ req->rc.sdata = NULL; dont_need_post_recv: /* Post the request */ c = kmalloc(sizeof *c, GFP_NOFS); if (!c) { err = -ENOMEM; goto send_error; } c->req = req; c->busa = ib_dma_map_single(rdma->cm_id->device, c->req->tc.sdata, c->req->tc.size, DMA_TO_DEVICE); if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) { err = -EIO; goto send_error; } c->cqe.done = send_done; sge.addr = c->busa; sge.length = c->req->tc.size; sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; wr.wr_cqe = &c->cqe; wr.opcode = IB_WR_SEND; wr.send_flags = IB_SEND_SIGNALED; wr.sg_list = &sge; wr.num_sge = 1; if (down_interruptible(&rdma->sq_sem)) { err = -EINTR; goto send_error; } /* Mark request as `sent' *before* we actually send it, * because doing if after could erase the REQ_STATUS_RCVD * status in case of a very fast reply. */ req->status = REQ_STATUS_SENT; err = ib_post_send(rdma->qp, &wr, NULL); if (err) goto send_error; /* Success */ return 0; /* Handle errors that happened during or while preparing the send: */ send_error: req->status = REQ_STATUS_ERROR; kfree(c); p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err); /* Ach. * We did recv_post(), but not send. We have one recv_post in excess. */ atomic_inc(&rdma->excess_rc); return err; /* Handle errors that happened during or while preparing post_recv(): */ recv_error: kfree(rpl_context); spin_lock_irqsave(&rdma->req_lock, flags); if (err != -EINTR && rdma->state < P9_RDMA_CLOSING) { rdma->state = P9_RDMA_CLOSING; spin_unlock_irqrestore(&rdma->req_lock, flags); rdma_disconnect(rdma->cm_id); } else spin_unlock_irqrestore(&rdma->req_lock, flags); return err; } static void rdma_close(struct p9_client *client) { struct p9_trans_rdma *rdma; if (!client) return; rdma = client->trans; if (!rdma) return; client->status = Disconnected; rdma_disconnect(rdma->cm_id); rdma_destroy_trans(rdma); } /** * alloc_rdma - Allocate and initialize the rdma transport structure * @opts: Mount options structure */ static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts) { struct p9_trans_rdma *rdma; rdma = kzalloc(sizeof(struct p9_trans_rdma), GFP_KERNEL); if (!rdma) return NULL; rdma->port = opts->port; rdma->privport = opts->privport; rdma->sq_depth = opts->sq_depth; rdma->rq_depth = opts->rq_depth; rdma->timeout = opts->timeout; spin_lock_init(&rdma->req_lock); init_completion(&rdma->cm_done); sema_init(&rdma->sq_sem, rdma->sq_depth); sema_init(&rdma->rq_sem, rdma->rq_depth); atomic_set(&rdma->excess_rc, 0); return rdma; } static int rdma_cancel(struct p9_client *client, struct p9_req_t *req) { /* Nothing to do here. * We will take care of it (if we have to) in rdma_cancelled() */ return 1; } /* A request has been fully flushed without a reply. * That means we have posted one buffer in excess. */ static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req) { struct p9_trans_rdma *rdma = client->trans; atomic_inc(&rdma->excess_rc); return 0; } static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma) { struct sockaddr_in cl = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), }; int port, err = -EINVAL; for (port = P9_DEF_MAX_RESVPORT; port >= P9_DEF_MIN_RESVPORT; port--) { cl.sin_port = htons((ushort)port); err = rdma_bind_addr(rdma->cm_id, (struct sockaddr *)&cl); if (err != -EADDRINUSE) break; } return err; } /** * rdma_create_trans - Transport method for creating a transport instance * @client: client instance * @addr: IP address string * @args: Mount options string */ static int rdma_create_trans(struct p9_client *client, const char *addr, char *args) { int err; struct p9_rdma_opts opts; struct p9_trans_rdma *rdma; struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; if (addr == NULL) return -EINVAL; /* Parse the transport specific mount options */ err = parse_opts(args, &opts); if (err < 0) return err; /* Create and initialize the RDMA transport structure */ rdma = alloc_rdma(&opts); if (!rdma) return -ENOMEM; /* Create the RDMA CM ID */ rdma->cm_id = rdma_create_id(&init_net, p9_cm_event_handler, client, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(rdma->cm_id)) goto error; /* Associate the client with the transport */ client->trans = rdma; /* Bind to a privileged port if we need to */ if (opts.privport) { err = p9_rdma_bind_privport(rdma); if (err < 0) { pr_err("%s (%d): problem binding to privport: %d\n", __func__, task_pid_nr(current), -err); goto error; } } /* Resolve the server's address */ rdma->addr.sin_family = AF_INET; rdma->addr.sin_addr.s_addr = in_aton(addr); rdma->addr.sin_port = htons(opts.port); err = rdma_resolve_addr(rdma->cm_id, NULL, (struct sockaddr *)&rdma->addr, rdma->timeout); if (err) goto error; err = wait_for_completion_interruptible(&rdma->cm_done); if (err || (rdma->state != P9_RDMA_ADDR_RESOLVED)) goto error; /* Resolve the route to the server */ err = rdma_resolve_route(rdma->cm_id, rdma->timeout); if (err) goto error; err = wait_for_completion_interruptible(&rdma->cm_done); if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED)) goto error; /* Create the Completion Queue */ rdma->cq = ib_alloc_cq(rdma->cm_id->device, client, opts.sq_depth + opts.rq_depth + 1, 0, IB_POLL_SOFTIRQ); if (IS_ERR(rdma->cq)) goto error; /* Create the Protection Domain */ rdma->pd = ib_alloc_pd(rdma->cm_id->device, 0); if (IS_ERR(rdma->pd)) goto error; /* Create the Queue Pair */ memset(&qp_attr, 0, sizeof qp_attr); qp_attr.event_handler = qp_event_handler; qp_attr.qp_context = client; qp_attr.cap.max_send_wr = opts.sq_depth; qp_attr.cap.max_recv_wr = opts.rq_depth; qp_attr.cap.max_send_sge = P9_RDMA_SEND_SGE; qp_attr.cap.max_recv_sge = P9_RDMA_RECV_SGE; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; qp_attr.qp_type = IB_QPT_RC; qp_attr.send_cq = rdma->cq; qp_attr.recv_cq = rdma->cq; err = rdma_create_qp(rdma->cm_id, rdma->pd, &qp_attr); if (err) goto error; rdma->qp = rdma->cm_id->qp; /* Request a connection */ memset(&conn_param, 0, sizeof(conn_param)); conn_param.private_data = NULL; conn_param.private_data_len = 0; conn_param.responder_resources = P9_RDMA_IRD; conn_param.initiator_depth = P9_RDMA_ORD; err = rdma_connect(rdma->cm_id, &conn_param); if (err) goto error; err = wait_for_completion_interruptible(&rdma->cm_done); if (err || (rdma->state != P9_RDMA_CONNECTED)) goto error; client->status = Connected; return 0; error: rdma_destroy_trans(rdma); return -ENOTCONN; } static struct p9_trans_module p9_rdma_trans = { .name = "rdma", .maxsize = P9_RDMA_MAXSIZE, .def = 0, .owner = THIS_MODULE, .create = rdma_create_trans, .close = rdma_close, .request = rdma_request, .cancel = rdma_cancel, .cancelled = rdma_cancelled, .show_options = p9_rdma_show_options, }; /** * p9_trans_rdma_init - Register the 9P RDMA transport driver */ static int __init p9_trans_rdma_init(void) { v9fs_register_trans(&p9_rdma_trans); return 0; } static void __exit p9_trans_rdma_exit(void) { v9fs_unregister_trans(&p9_rdma_trans); } module_init(p9_trans_rdma_init); module_exit(p9_trans_rdma_exit); MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>"); MODULE_DESCRIPTION("RDMA Transport for 9P"); MODULE_LICENSE("Dual BSD/GPL"); |
336 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 | /* Authentication token and access key management internal defs * * Copyright (C) 2003-5, 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ #ifndef _INTERNAL_H #define _INTERNAL_H #include <linux/sched.h> #include <linux/wait_bit.h> #include <linux/cred.h> #include <linux/key-type.h> #include <linux/task_work.h> #include <linux/keyctl.h> #include <linux/refcount.h> #include <linux/compat.h> #include <linux/mm.h> #include <linux/vmalloc.h> struct iovec; #ifdef __KDEBUG #define kenter(FMT, ...) \ printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) #define kleave(FMT, ...) \ printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) #define kdebug(FMT, ...) \ printk(KERN_DEBUG " "FMT"\n", ##__VA_ARGS__) #else #define kenter(FMT, ...) \ no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) #define kleave(FMT, ...) \ no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) #define kdebug(FMT, ...) \ no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) #endif extern struct key_type key_type_dead; extern struct key_type key_type_user; extern struct key_type key_type_logon; /*****************************************************************************/ /* * Keep track of keys for a user. * * This needs to be separate to user_struct to avoid a refcount-loop * (user_struct pins some keyrings which pin this struct). * * We also keep track of keys under request from userspace for this UID here. */ struct key_user { struct rb_node node; struct mutex cons_lock; /* construction initiation lock */ spinlock_t lock; refcount_t usage; /* for accessing qnkeys & qnbytes */ atomic_t nkeys; /* number of keys */ atomic_t nikeys; /* number of instantiated keys */ kuid_t uid; int qnkeys; /* number of keys allocated to this user */ int qnbytes; /* number of bytes allocated to this user */ }; extern struct rb_root key_user_tree; extern spinlock_t key_user_lock; extern struct key_user root_key_user; extern struct key_user *key_user_lookup(kuid_t uid); extern void key_user_put(struct key_user *user); /* * Key quota limits. * - root has its own separate limits to everyone else */ extern unsigned key_quota_root_maxkeys; extern unsigned key_quota_root_maxbytes; extern unsigned key_quota_maxkeys; extern unsigned key_quota_maxbytes; #define KEYQUOTA_LINK_BYTES 4 /* a link in a keyring is worth 4 bytes */ extern struct kmem_cache *key_jar; extern struct rb_root key_serial_tree; extern spinlock_t key_serial_lock; extern struct mutex key_construction_mutex; extern wait_queue_head_t request_key_conswq; extern struct key_type *key_type_lookup(const char *type); extern void key_type_put(struct key_type *ktype); extern int __key_link_begin(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit **_edit); extern int __key_link_check_live_key(struct key *keyring, struct key *key); extern void __key_link(struct key *key, struct assoc_array_edit **_edit); extern void __key_link_end(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit *edit); extern key_ref_t find_key_to_update(key_ref_t keyring_ref, const struct keyring_index_key *index_key); extern struct key *keyring_search_instkey(struct key *keyring, key_serial_t target_id); extern int iterate_over_keyring(const struct key *keyring, int (*func)(const struct key *key, void *data), void *data); struct keyring_search_context { struct keyring_index_key index_key; const struct cred *cred; struct key_match_data match_data; unsigned flags; #define KEYRING_SEARCH_NO_STATE_CHECK 0x0001 /* Skip state checks */ #define KEYRING_SEARCH_DO_STATE_CHECK 0x0002 /* Override NO_STATE_CHECK */ #define KEYRING_SEARCH_NO_UPDATE_TIME 0x0004 /* Don't update times */ #define KEYRING_SEARCH_NO_CHECK_PERM 0x0008 /* Don't check permissions */ #define KEYRING_SEARCH_DETECT_TOO_DEEP 0x0010 /* Give an error on excessive depth */ #define KEYRING_SEARCH_SKIP_EXPIRED 0x0020 /* Ignore expired keys (intention to replace) */ int (*iterator)(const void *object, void *iterator_data); /* Internal stuff */ int skipped_ret; bool possessed; key_ref_t result; time64_t now; }; extern bool key_default_cmp(const struct key *key, const struct key_match_data *match_data); extern key_ref_t keyring_search_aux(key_ref_t keyring_ref, struct keyring_search_context *ctx); extern key_ref_t search_my_process_keyrings(struct keyring_search_context *ctx); extern key_ref_t search_process_keyrings(struct keyring_search_context *ctx); extern struct key *find_keyring_by_name(const char *name, bool uid_keyring); extern int install_user_keyrings(void); extern int install_thread_keyring_to_cred(struct cred *); extern int install_process_keyring_to_cred(struct cred *); extern int install_session_keyring_to_cred(struct cred *, struct key *); extern struct key *request_key_and_link(struct key_type *type, const char *description, const void *callout_info, size_t callout_len, void *aux, struct key *dest_keyring, unsigned long flags); extern bool lookup_user_key_possessed(const struct key *key, const struct key_match_data *match_data); extern key_ref_t lookup_user_key(key_serial_t id, unsigned long flags, key_perm_t perm); #define KEY_LOOKUP_CREATE 0x01 #define KEY_LOOKUP_PARTIAL 0x02 #define KEY_LOOKUP_FOR_UNLINK 0x04 extern long join_session_keyring(const char *name); extern void key_change_session_keyring(struct callback_head *twork); extern struct work_struct key_gc_work; extern unsigned key_gc_delay; extern void keyring_gc(struct key *keyring, time64_t limit); extern void keyring_restriction_gc(struct key *keyring, struct key_type *dead_type); extern void key_schedule_gc(time64_t gc_at); extern void key_schedule_gc_links(void); extern void key_gc_keytype(struct key_type *ktype); extern int key_task_permission(const key_ref_t key_ref, const struct cred *cred, key_perm_t perm); /* * Check to see whether permission is granted to use a key in the desired way. */ static inline int key_permission(const key_ref_t key_ref, unsigned perm) { return key_task_permission(key_ref, current_cred(), perm); } extern struct key_type key_type_request_key_auth; extern struct key *request_key_auth_new(struct key *target, const char *op, const void *callout_info, size_t callout_len, struct key *dest_keyring); extern struct key *key_get_instantiation_authkey(key_serial_t target_id); /* * Determine whether a key is dead. */ static inline bool key_is_dead(const struct key *key, time64_t limit) { return key->flags & ((1 << KEY_FLAG_DEAD) | (1 << KEY_FLAG_INVALIDATED)) || (key->expiry > 0 && key->expiry <= limit); } /* * keyctl() functions */ extern long keyctl_get_keyring_ID(key_serial_t, int); extern long keyctl_join_session_keyring(const char __user *); extern long keyctl_update_key(key_serial_t, const void __user *, size_t); extern long keyctl_revoke_key(key_serial_t); extern long keyctl_keyring_clear(key_serial_t); extern long keyctl_keyring_link(key_serial_t, key_serial_t); extern long keyctl_keyring_unlink(key_serial_t, key_serial_t); extern long keyctl_describe_key(key_serial_t, char __user *, size_t); extern long keyctl_keyring_search(key_serial_t, const char __user *, const char __user *, key_serial_t); extern long keyctl_read_key(key_serial_t, char __user *, size_t); extern long keyctl_chown_key(key_serial_t, uid_t, gid_t); extern long keyctl_setperm_key(key_serial_t, key_perm_t); extern long keyctl_instantiate_key(key_serial_t, const void __user *, size_t, key_serial_t); extern long keyctl_negate_key(key_serial_t, unsigned, key_serial_t); extern long keyctl_set_reqkey_keyring(int); extern long keyctl_set_timeout(key_serial_t, unsigned); extern long keyctl_assume_authority(key_serial_t); extern long keyctl_get_security(key_serial_t keyid, char __user *buffer, size_t buflen); extern long keyctl_session_to_parent(void); extern long keyctl_reject_key(key_serial_t, unsigned, unsigned, key_serial_t); extern long keyctl_instantiate_key_iov(key_serial_t, const struct iovec __user *, unsigned, key_serial_t); extern long keyctl_invalidate_key(key_serial_t); struct iov_iter; extern long keyctl_instantiate_key_common(key_serial_t, struct iov_iter *, key_serial_t); extern long keyctl_restrict_keyring(key_serial_t id, const char __user *_type, const char __user *_restriction); #ifdef CONFIG_PERSISTENT_KEYRINGS extern long keyctl_get_persistent(uid_t, key_serial_t); extern unsigned persistent_keyring_expiry; #else static inline long keyctl_get_persistent(uid_t uid, key_serial_t destring) { return -EOPNOTSUPP; } #endif #ifdef CONFIG_KEY_DH_OPERATIONS extern long keyctl_dh_compute(struct keyctl_dh_params __user *, char __user *, size_t, struct keyctl_kdf_params __user *); extern long __keyctl_dh_compute(struct keyctl_dh_params __user *, char __user *, size_t, struct keyctl_kdf_params *); #ifdef CONFIG_KEYS_COMPAT extern long compat_keyctl_dh_compute(struct keyctl_dh_params __user *params, char __user *buffer, size_t buflen, struct compat_keyctl_kdf_params __user *kdf); #endif #define KEYCTL_KDF_MAX_OUTPUT_LEN 1024 /* max length of KDF output */ #define KEYCTL_KDF_MAX_OI_LEN 64 /* max length of otherinfo */ #else static inline long keyctl_dh_compute(struct keyctl_dh_params __user *params, char __user *buffer, size_t buflen, struct keyctl_kdf_params __user *kdf) { return -EOPNOTSUPP; } #ifdef CONFIG_KEYS_COMPAT static inline long compat_keyctl_dh_compute( struct keyctl_dh_params __user *params, char __user *buffer, size_t buflen, struct keyctl_kdf_params __user *kdf) { return -EOPNOTSUPP; } #endif #endif /* * Debugging key validation */ #ifdef KEY_DEBUGGING extern void __key_check(const struct key *); static inline void key_check(const struct key *key) { if (key && (IS_ERR(key) || key->magic != KEY_DEBUG_MAGIC)) __key_check(key); } #else #define key_check(key) do {} while(0) #endif #endif /* _INTERNAL_H */ |
1 1 1 1 1 1 1 1 1 1 43 42 43 41 218 218 117 117 43 43 78 77 20 20 41 25 41 11 197 18 190 70 51 44 51 46 51 4 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 2 2 2 1 1 1 1 1 1 110 110 77 73 1 77 26 76 56 1 1 56 76 58 1 1 57 76 1 1 12 76 76 76 76 2 76 5 3 3 3 3 1 1 3 2 1 3 3 3 2 26 14 26 14 14 14 105 72 72 72 72 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_ialloc.h" #include "xfs_itable.h" #include "xfs_quota.h" #include "xfs_error.h" #include "xfs_bmap.h" #include "xfs_bmap_btree.h" #include "xfs_bmap_util.h" #include "xfs_trans.h" #include "xfs_trans_space.h" #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_cksum.h" /* * The global quota manager. There is only one of these for the entire * system, _not_ one per file system. XQM keeps track of the overall * quota functionality, including maintaining the freelist and hash * tables of dquots. */ STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); STATIC void xfs_qm_destroy_quotainos(xfs_quotainfo_t *qi); STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp); /* * We use the batch lookup interface to iterate over the dquots as it * currently is the only interface into the radix tree code that allows * fuzzy lookups instead of exact matches. Holding the lock over multiple * operations is fine as all callers are used either during mount/umount * or quotaoff. */ #define XFS_DQ_LOOKUP_BATCH 32 STATIC int xfs_qm_dquot_walk( struct xfs_mount *mp, int type, int (*execute)(struct xfs_dquot *dqp, void *data), void *data) { struct xfs_quotainfo *qi = mp->m_quotainfo; struct radix_tree_root *tree = xfs_dquot_tree(qi, type); uint32_t next_index; int last_error = 0; int skipped; int nr_found; restart: skipped = 0; next_index = 0; nr_found = 0; while (1) { struct xfs_dquot *batch[XFS_DQ_LOOKUP_BATCH]; int error = 0; int i; mutex_lock(&qi->qi_tree_lock); nr_found = radix_tree_gang_lookup(tree, (void **)batch, next_index, XFS_DQ_LOOKUP_BATCH); if (!nr_found) { mutex_unlock(&qi->qi_tree_lock); break; } for (i = 0; i < nr_found; i++) { struct xfs_dquot *dqp = batch[i]; next_index = be32_to_cpu(dqp->q_core.d_id) + 1; error = execute(batch[i], data); if (error == -EAGAIN) { skipped++; continue; } if (error && last_error != -EFSCORRUPTED) last_error = error; } mutex_unlock(&qi->qi_tree_lock); /* bail out if the filesystem is corrupted. */ if (last_error == -EFSCORRUPTED) { skipped = 0; break; } /* we're done if id overflows back to zero */ if (!next_index) break; } if (skipped) { delay(1); goto restart; } return last_error; } /* * Purge a dquot from all tracking data structures and free it. */ STATIC int xfs_qm_dqpurge( struct xfs_dquot *dqp, void *data) { struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; xfs_dqlock(dqp); if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { xfs_dqunlock(dqp); return -EAGAIN; } dqp->dq_flags |= XFS_DQ_FREEING; xfs_dqflock(dqp); /* * If we are turning this type of quotas off, we don't care * about the dirty metadata sitting in this dquot. OTOH, if * we're unmounting, we do care, so we flush it and wait. */ if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; int error; /* * We don't care about getting disk errors here. We need * to purge this dquot anyway, so we go ahead regardless. */ error = xfs_qm_dqflush(dqp, &bp); if (!error) { error = xfs_bwrite(bp); xfs_buf_relse(bp); } xfs_dqflock(dqp); } ASSERT(atomic_read(&dqp->q_pincount) == 0); ASSERT(XFS_FORCED_SHUTDOWN(mp) || !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags)); xfs_dqfunlock(dqp); xfs_dqunlock(dqp); radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), be32_to_cpu(dqp->q_core.d_id)); qi->qi_dquots--; /* * We move dquots to the freelist as soon as their reference count * hits zero, so it really should be on the freelist here. */ ASSERT(!list_empty(&dqp->q_lru)); list_lru_del(&qi->qi_lru, &dqp->q_lru); XFS_STATS_DEC(mp, xs_qm_dquot_unused); xfs_qm_dqdestroy(dqp); return 0; } /* * Purge the dquot cache. */ void xfs_qm_dqpurge_all( struct xfs_mount *mp, uint flags) { if (flags & XFS_QMOPT_UQUOTA) xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_GQUOTA) xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL); if (flags & XFS_QMOPT_PQUOTA) xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL); } /* * Just destroy the quotainfo structure. */ void xfs_qm_unmount( struct xfs_mount *mp) { if (mp->m_quotainfo) { xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); xfs_qm_destroy_quotainfo(mp); } } /* * Called from the vfsops layer. */ void xfs_qm_unmount_quotas( xfs_mount_t *mp) { /* * Release the dquots that root inode, et al might be holding, * before we flush quotas and blow away the quotainfo structure. */ ASSERT(mp->m_rootip); xfs_qm_dqdetach(mp->m_rootip); if (mp->m_rbmip) xfs_qm_dqdetach(mp->m_rbmip); if (mp->m_rsumip) xfs_qm_dqdetach(mp->m_rsumip); /* * Release the quota inodes. */ if (mp->m_quotainfo) { if (mp->m_quotainfo->qi_uquotaip) { xfs_irele(mp->m_quotainfo->qi_uquotaip); mp->m_quotainfo->qi_uquotaip = NULL; } if (mp->m_quotainfo->qi_gquotaip) { xfs_irele(mp->m_quotainfo->qi_gquotaip); mp->m_quotainfo->qi_gquotaip = NULL; } if (mp->m_quotainfo->qi_pquotaip) { xfs_irele(mp->m_quotainfo->qi_pquotaip); mp->m_quotainfo->qi_pquotaip = NULL; } } } STATIC int xfs_qm_dqattach_one( xfs_inode_t *ip, xfs_dqid_t id, uint type, bool doalloc, xfs_dquot_t **IO_idqpp) { xfs_dquot_t *dqp; int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); error = 0; /* * See if we already have it in the inode itself. IO_idqpp is &i_udquot * or &i_gdquot. This made the code look weird, but made the logic a lot * simpler. */ dqp = *IO_idqpp; if (dqp) { trace_xfs_dqattach_found(dqp); return 0; } /* * Find the dquot from somewhere. This bumps the reference count of * dquot and returns it locked. This can return ENOENT if dquot didn't * exist on disk and we didn't ask it to allocate; ESRCH if quotas got * turned off suddenly. */ error = xfs_qm_dqget_inode(ip, type, doalloc, &dqp); if (error) return error; trace_xfs_dqattach_get(dqp); /* * dqget may have dropped and re-acquired the ilock, but it guarantees * that the dquot returned is the one that should go in the inode. */ *IO_idqpp = dqp; xfs_dqunlock(dqp); return 0; } static bool xfs_qm_need_dqattach( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; if (!XFS_IS_QUOTA_RUNNING(mp)) return false; if (!XFS_IS_QUOTA_ON(mp)) return false; if (!XFS_NOT_DQATTACHED(mp, ip)) return false; if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) return false; return true; } /* * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON * into account. * If @doalloc is true, the dquot(s) will be allocated if needed. * Inode may get unlocked and relocked in here, and the caller must deal with * the consequences. */ int xfs_qm_dqattach_locked( xfs_inode_t *ip, bool doalloc) { xfs_mount_t *mp = ip->i_mount; int error = 0; if (!xfs_qm_need_dqattach(ip)) return 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, doalloc, &ip->i_udquot); if (error) goto done; ASSERT(ip->i_udquot); } if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, doalloc, &ip->i_gdquot); if (error) goto done; ASSERT(ip->i_gdquot); } if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, doalloc, &ip->i_pdquot); if (error) goto done; ASSERT(ip->i_pdquot); } done: /* * Don't worry about the dquots that we may have attached before any * error - they'll get detached later if it has not already been done. */ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); return error; } int xfs_qm_dqattach( struct xfs_inode *ip) { int error; if (!xfs_qm_need_dqattach(ip)) return 0; xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_qm_dqattach_locked(ip, false); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } /* * Release dquots (and their references) if any. * The inode should be locked EXCL except when this's called by * xfs_ireclaim. */ void xfs_qm_dqdetach( xfs_inode_t *ip) { if (!(ip->i_udquot || ip->i_gdquot || ip->i_pdquot)) return; trace_xfs_dquot_dqdetach(ip); ASSERT(!xfs_is_quota_inode(&ip->i_mount->m_sb, ip->i_ino)); if (ip->i_udquot) { xfs_qm_dqrele(ip->i_udquot); ip->i_udquot = NULL; } if (ip->i_gdquot) { xfs_qm_dqrele(ip->i_gdquot); ip->i_gdquot = NULL; } if (ip->i_pdquot) { xfs_qm_dqrele(ip->i_pdquot); ip->i_pdquot = NULL; } } struct xfs_qm_isolate { struct list_head buffers; struct list_head dispose; }; static enum lru_status xfs_qm_dquot_isolate( struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) __releases(lru_lock) __acquires(lru_lock) { struct xfs_dquot *dqp = container_of(item, struct xfs_dquot, q_lru); struct xfs_qm_isolate *isol = arg; if (!xfs_dqlock_nowait(dqp)) goto out_miss_busy; /* * This dquot has acquired a reference in the meantime remove it from * the freelist and try again. */ if (dqp->q_nrefs) { xfs_dqunlock(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants); trace_xfs_dqreclaim_want(dqp); list_lru_isolate(lru, &dqp->q_lru); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused); return LRU_REMOVED; } /* * If the dquot is dirty, flush it. If it's already being flushed, just * skip it so there is time for the IO to complete before we try to * reclaim it again on the next LRU pass. */ if (!xfs_dqflock_nowait(dqp)) { xfs_dqunlock(dqp); goto out_miss_busy; } if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; int error; trace_xfs_dqreclaim_dirty(dqp); /* we have to drop the LRU lock to flush the dquot */ spin_unlock(lru_lock); error = xfs_qm_dqflush(dqp, &bp); if (error) goto out_unlock_dirty; xfs_buf_delwri_queue(bp, &isol->buffers); xfs_buf_relse(bp); goto out_unlock_dirty; } xfs_dqfunlock(dqp); /* * Prevent lookups now that we are past the point of no return. */ dqp->dq_flags |= XFS_DQ_FREEING; xfs_dqunlock(dqp); ASSERT(dqp->q_nrefs == 0); list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused); trace_xfs_dqreclaim_done(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims); return LRU_REMOVED; out_miss_busy: trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); return LRU_SKIP; out_unlock_dirty: trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); xfs_dqunlock(dqp); spin_lock(lru_lock); return LRU_RETRY; } static unsigned long xfs_qm_shrink_scan( struct shrinker *shrink, struct shrink_control *sc) { struct xfs_quotainfo *qi = container_of(shrink, struct xfs_quotainfo, qi_shrinker); struct xfs_qm_isolate isol; unsigned long freed; int error; if ((sc->gfp_mask & (__GFP_FS|__GFP_DIRECT_RECLAIM)) != (__GFP_FS|__GFP_DIRECT_RECLAIM)) return 0; INIT_LIST_HEAD(&isol.buffers); INIT_LIST_HEAD(&isol.dispose); freed = list_lru_shrink_walk(&qi->qi_lru, sc, xfs_qm_dquot_isolate, &isol); error = xfs_buf_delwri_submit(&isol.buffers); if (error) xfs_warn(NULL, "%s: dquot reclaim failed", __func__); while (!list_empty(&isol.dispose)) { struct xfs_dquot *dqp; dqp = list_first_entry(&isol.dispose, struct xfs_dquot, q_lru); list_del_init(&dqp->q_lru); xfs_qm_dqfree_one(dqp); } return freed; } static unsigned long xfs_qm_shrink_count( struct shrinker *shrink, struct shrink_control *sc) { struct xfs_quotainfo *qi = container_of(shrink, struct xfs_quotainfo, qi_shrinker); return list_lru_shrink_count(&qi->qi_lru, sc); } STATIC void xfs_qm_set_defquota( xfs_mount_t *mp, uint type, xfs_quotainfo_t *qinf) { xfs_dquot_t *dqp; struct xfs_def_quota *defq; struct xfs_disk_dquot *ddqp; int error; error = xfs_qm_dqget_uncached(mp, 0, type, &dqp); if (error) return; ddqp = &dqp->q_core; defq = xfs_get_defquota(dqp, qinf); /* * Timers and warnings have been already set, let's just set the * default limits for this quota type */ defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); xfs_qm_dqdestroy(dqp); } /* Initialize quota time limits from the root dquot. */ static void xfs_qm_init_timelimits( struct xfs_mount *mp, struct xfs_quotainfo *qinf) { struct xfs_disk_dquot *ddqp; struct xfs_dquot *dqp; uint type; int error; qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; qinf->qi_itimelimit = XFS_QM_ITIMELIMIT; qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT; qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT; qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT; qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; /* * We try to get the limits from the superuser's limits fields. * This is quite hacky, but it is standard quota practice. * * Since we may not have done a quotacheck by this point, just read * the dquot without attaching it to any hashtables or lists. * * Timers and warnings are globally set by the first timer found in * user/group/proj quota types, otherwise a default value is used. * This should be split into different fields per quota type. */ if (XFS_IS_UQUOTA_RUNNING(mp)) type = XFS_DQ_USER; else if (XFS_IS_GQUOTA_RUNNING(mp)) type = XFS_DQ_GROUP; else type = XFS_DQ_PROJ; error = xfs_qm_dqget_uncached(mp, 0, type, &dqp); if (error) return; ddqp = &dqp->q_core; /* * The warnings and timers set the grace period given to * a user or group before he or she can not perform any * more writing. If it is zero, a default is used. */ if (ddqp->d_btimer) qinf->qi_btimelimit = be32_to_cpu(ddqp->d_btimer); if (ddqp->d_itimer) qinf->qi_itimelimit = be32_to_cpu(ddqp->d_itimer); if (ddqp->d_rtbtimer) qinf->qi_rtbtimelimit = be32_to_cpu(ddqp->d_rtbtimer); if (ddqp->d_bwarns) qinf->qi_bwarnlimit = be16_to_cpu(ddqp->d_bwarns); if (ddqp->d_iwarns) qinf->qi_iwarnlimit = be16_to_cpu(ddqp->d_iwarns); if (ddqp->d_rtbwarns) qinf->qi_rtbwarnlimit = be16_to_cpu(ddqp->d_rtbwarns); xfs_qm_dqdestroy(dqp); } /* * This initializes all the quota information that's kept in the * mount structure */ STATIC int xfs_qm_init_quotainfo( struct xfs_mount *mp) { struct xfs_quotainfo *qinf; int error; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); error = list_lru_init(&qinf->qi_lru); if (error) goto out_free_qinf; /* * See if quotainodes are setup, and if not, allocate them, * and change the superblock accordingly. */ error = xfs_qm_init_quotainos(mp); if (error) goto out_free_lru; INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS); mutex_init(&qinf->qi_tree_lock); /* mutex used to serialize quotaoffs */ mutex_init(&qinf->qi_quotaofflock); /* Precalc some constants */ qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen); mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); xfs_qm_init_timelimits(mp, qinf); if (XFS_IS_UQUOTA_RUNNING(mp)) xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf); if (XFS_IS_GQUOTA_RUNNING(mp)) xfs_qm_set_defquota(mp, XFS_DQ_GROUP, qinf); if (XFS_IS_PQUOTA_RUNNING(mp)) xfs_qm_set_defquota(mp, XFS_DQ_PROJ, qinf); qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; qinf->qi_shrinker.seeks = DEFAULT_SEEKS; qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; error = register_shrinker(&qinf->qi_shrinker); if (error) goto out_free_inos; return 0; out_free_inos: mutex_destroy(&qinf->qi_quotaofflock); mutex_destroy(&qinf->qi_tree_lock); xfs_qm_destroy_quotainos(qinf); out_free_lru: list_lru_destroy(&qinf->qi_lru); out_free_qinf: kmem_free(qinf); mp->m_quotainfo = NULL; return error; } /* * Gets called when unmounting a filesystem or when all quotas get * turned off. * This purges the quota inodes, destroys locks and frees itself. */ void xfs_qm_destroy_quotainfo( xfs_mount_t *mp) { xfs_quotainfo_t *qi; qi = mp->m_quotainfo; ASSERT(qi != NULL); unregister_shrinker(&qi->qi_shrinker); list_lru_destroy(&qi->qi_lru); xfs_qm_destroy_quotainos(qi); mutex_destroy(&qi->qi_tree_lock); mutex_destroy(&qi->qi_quotaofflock); kmem_free(qi); mp->m_quotainfo = NULL; } /* * Create an inode and return with a reference already taken, but unlocked * This is how we create quota inodes */ STATIC int xfs_qm_qino_alloc( xfs_mount_t *mp, xfs_inode_t **ip, uint flags) { xfs_trans_t *tp; int error; bool need_alloc = true; *ip = NULL; /* * With superblock that doesn't have separate pquotino, we * share an inode between gquota and pquota. If the on-disk * superblock has GQUOTA and the filesystem is now mounted * with PQUOTA, just use sb_gquotino for sb_pquotino and * vice-versa. */ if (!xfs_sb_version_has_pquotino(&mp->m_sb) && (flags & (XFS_QMOPT_PQUOTA|XFS_QMOPT_GQUOTA))) { xfs_ino_t ino = NULLFSINO; if ((flags & XFS_QMOPT_PQUOTA) && (mp->m_sb.sb_gquotino != NULLFSINO)) { ino = mp->m_sb.sb_gquotino; ASSERT(mp->m_sb.sb_pquotino == NULLFSINO); } else if ((flags & XFS_QMOPT_GQUOTA) && (mp->m_sb.sb_pquotino != NULLFSINO)) { ino = mp->m_sb.sb_pquotino; ASSERT(mp->m_sb.sb_gquotino == NULLFSINO); } if (ino != NULLFSINO) { error = xfs_iget(mp, NULL, ino, 0, 0, ip); if (error) return error; mp->m_sb.sb_gquotino = NULLFSINO; mp->m_sb.sb_pquotino = NULLFSINO; need_alloc = false; } } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create, XFS_QM_QINOCREATE_SPACE_RES(mp), 0, 0, &tp); if (error) return error; if (need_alloc) { error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip); if (error) { xfs_trans_cancel(tp); return error; } } /* * Make the changes in the superblock, and log those too. * sbfields arg may contain fields other than *QUOTINO; * VERSIONNUM for example. */ spin_lock(&mp->m_sb_lock); if (flags & XFS_QMOPT_SBVERSION) { ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); xfs_sb_version_addquota(&mp->m_sb); mp->m_sb.sb_uquotino = NULLFSINO; mp->m_sb.sb_gquotino = NULLFSINO; mp->m_sb.sb_pquotino = NULLFSINO; /* qflags will get updated fully _after_ quotacheck */ mp->m_sb.sb_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT; } if (flags & XFS_QMOPT_UQUOTA) mp->m_sb.sb_uquotino = (*ip)->i_ino; else if (flags & XFS_QMOPT_GQUOTA) mp->m_sb.sb_gquotino = (*ip)->i_ino; else mp->m_sb.sb_pquotino = (*ip)->i_ino; spin_unlock(&mp->m_sb_lock); xfs_log_sb(tp); error = xfs_trans_commit(tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); xfs_alert(mp, "%s failed (error %d)!", __func__, error); } if (need_alloc) xfs_finish_inode_setup(*ip); return error; } STATIC void xfs_qm_reset_dqcounts( xfs_mount_t *mp, xfs_buf_t *bp, xfs_dqid_t id, uint type) { struct xfs_dqblk *dqb; int j; xfs_failaddr_t fa; trace_xfs_reset_dqcounts(bp, _RET_IP_); /* * Reset all counters and timers. They'll be * started afresh by xfs_qm_quotacheck. */ #ifdef DEBUG j = (int)XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) / sizeof(xfs_dqblk_t); ASSERT(mp->m_quotainfo->qi_dqperchunk == j); #endif dqb = bp->b_addr; for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { struct xfs_disk_dquot *ddq; ddq = (struct xfs_disk_dquot *)&dqb[j]; /* * Do a sanity check, and if needed, repair the dqblk. Don't * output any warnings because it's perfectly possible to * find uninitialised dquot blks. See comment in * xfs_dquot_verify. */ fa = xfs_dqblk_verify(mp, &dqb[j], id + j, type); if (fa) xfs_dqblk_repair(mp, &dqb[j], id + j, type); /* * Reset type in case we are reusing group quota file for * project quotas or vice versa */ ddq->d_flags = type; ddq->d_bcount = 0; ddq->d_icount = 0; ddq->d_rtbcount = 0; ddq->d_btimer = 0; ddq->d_itimer = 0; ddq->d_rtbtimer = 0; ddq->d_bwarns = 0; ddq->d_iwarns = 0; ddq->d_rtbwarns = 0; if (xfs_sb_version_hascrc(&mp->m_sb)) { xfs_update_cksum((char *)&dqb[j], sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } } } STATIC int xfs_qm_reset_dqcounts_all( struct xfs_mount *mp, xfs_dqid_t firstid, xfs_fsblock_t bno, xfs_filblks_t blkcnt, uint flags, struct list_head *buffer_list) { struct xfs_buf *bp; int error; int type; ASSERT(blkcnt > 0); type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP); error = 0; /* * Blkcnt arg can be a very big number, and might even be * larger than the log itself. So, we have to break it up into * manageable-sized transactions. * Note that we don't start a permanent transaction here; we might * not be able to get a log reservation for the whole thing up front, * and we don't really care to either, because we just discard * everything if we were to crash in the middle of this loop. */ while (blkcnt--) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), mp->m_quotainfo->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); /* * CRC and validation errors will return a EFSCORRUPTED here. If * this occurs, re-read without CRC validation so that we can * repair the damage via xfs_qm_reset_dqcounts(). This process * will leave a trace in the log indicating corruption has * been detected. */ if (error == -EFSCORRUPTED) { error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, bno), mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); } if (error) break; /* * A corrupt buffer might not have a verifier attached, so * make sure we have the correct one attached before writeback * occurs. */ bp->b_ops = &xfs_dquot_buf_ops; xfs_qm_reset_dqcounts(mp, bp, firstid, type); xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); /* goto the next block. */ bno++; firstid += mp->m_quotainfo->qi_dqperchunk; } return error; } /* * Iterate over all allocated dquot blocks in this quota inode, zeroing all * counters for every chunk of dquots that we find. */ STATIC int xfs_qm_reset_dqcounts_buf( struct xfs_mount *mp, struct xfs_inode *qip, uint flags, struct list_head *buffer_list) { struct xfs_bmbt_irec *map; int i, nmaps; /* number of map entries */ int error; /* return value */ xfs_fileoff_t lblkno; xfs_filblks_t maxlblkcnt; xfs_dqid_t firstid; xfs_fsblock_t rablkno; xfs_filblks_t rablkcnt; error = 0; /* * This looks racy, but we can't keep an inode lock across a * trans_reserve. But, this gets called during quotacheck, and that * happens only at mount time which is single threaded. */ if (qip->i_d.di_nblocks == 0) return 0; map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); lblkno = 0; maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); do { uint lock_mode; nmaps = XFS_DQITER_MAP_SIZE; /* * We aren't changing the inode itself. Just changing * some of its data. No new blocks are added here, and * the inode is never added to the transaction. */ lock_mode = xfs_ilock_data_map_shared(qip); error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno, map, &nmaps, 0); xfs_iunlock(qip, lock_mode); if (error) break; ASSERT(nmaps <= XFS_DQITER_MAP_SIZE); for (i = 0; i < nmaps; i++) { ASSERT(map[i].br_startblock != DELAYSTARTBLOCK); ASSERT(map[i].br_blockcount); lblkno += map[i].br_blockcount; if (map[i].br_startblock == HOLESTARTBLOCK) continue; firstid = (xfs_dqid_t) map[i].br_startoff * mp->m_quotainfo->qi_dqperchunk; /* * Do a read-ahead on the next extent. */ if ((i+1 < nmaps) && (map[i+1].br_startblock != HOLESTARTBLOCK)) { rablkcnt = map[i+1].br_blockcount; rablkno = map[i+1].br_startblock; while (rablkcnt--) { xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, rablkno), mp->m_quotainfo->qi_dqchunklen, &xfs_dquot_buf_ops); rablkno++; } } /* * Iterate thru all the blks in the extent and * reset the counters of all the dquots inside them. */ error = xfs_qm_reset_dqcounts_all(mp, firstid, map[i].br_startblock, map[i].br_blockcount, flags, buffer_list); if (error) goto out; } } while (nmaps > 0); out: kmem_free(map); return error; } /* * Called by dqusage_adjust in doing a quotacheck. * * Given the inode, and a dquot id this updates both the incore dqout as well * as the buffer copy. This is so that once the quotacheck is done, we can * just log all the buffers, as opposed to logging numerous updates to * individual dquots. */ STATIC int xfs_qm_quotacheck_dqadjust( struct xfs_inode *ip, uint type, xfs_qcnt_t nblks, xfs_qcnt_t rtblks) { struct xfs_mount *mp = ip->i_mount; struct xfs_dquot *dqp; xfs_dqid_t id; int error; id = xfs_qm_id_for_quotatype(ip, type); error = xfs_qm_dqget(mp, id, type, true, &dqp); if (error) { /* * Shouldn't be able to turn off quotas here. */ ASSERT(error != -ESRCH); ASSERT(error != -ENOENT); return error; } trace_xfs_dqadjust(dqp); /* * Adjust the inode count and the block count to reflect this inode's * resource usage. */ be64_add_cpu(&dqp->q_core.d_icount, 1); dqp->q_res_icount++; if (nblks) { be64_add_cpu(&dqp->q_core.d_bcount, nblks); dqp->q_res_bcount += nblks; } if (rtblks) { be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks); dqp->q_res_rtbcount += rtblks; } /* * Set default limits, adjust timers (since we changed usages) * * There are no timers for the default values set in the root dquot. */ if (dqp->q_core.d_id) { xfs_qm_adjust_dqlimits(mp, dqp); xfs_qm_adjust_dqtimers(mp, &dqp->q_core); } dqp->dq_flags |= XFS_DQ_DIRTY; xfs_qm_dqput(dqp); return 0; } /* * callback routine supplied to bulkstat(). Given an inumber, find its * dquots and update them to account for resources taken by that inode. */ /* ARGSUSED */ STATIC int xfs_qm_dqusage_adjust( xfs_mount_t *mp, /* mount point for filesystem */ xfs_ino_t ino, /* inode number to get data for */ void __user *buffer, /* not used */ int ubsize, /* not used */ int *ubused, /* not used */ int *res) /* result code value */ { xfs_inode_t *ip; xfs_qcnt_t nblks; xfs_filblks_t rtblks = 0; /* total rt blks */ int error; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); /* * rootino must have its resources accounted for, not so with the quota * inodes. */ if (xfs_is_quota_inode(&mp->m_sb, ino)) { *res = BULKSTAT_RV_NOTHING; return -EINVAL; } /* * We don't _need_ to take the ilock EXCL here because quotacheck runs * at mount time and therefore nobody will be racing chown/chproj. */ error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, 0, &ip); if (error) { *res = BULKSTAT_RV_NOTHING; return error; } ASSERT(ip->i_delayed_blks == 0); if (XFS_IS_REALTIME_INODE(ip)) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); if (error) goto error0; } xfs_bmap_count_leaves(ifp, &rtblks); } nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks; /* * Add the (disk blocks and inode) resources occupied by this * inode to its dquots. We do this adjustment in the incore dquot, * and also copy the changes to its buffer. * We don't care about putting these changes in a transaction * envelope because if we crash in the middle of a 'quotacheck' * we have to start from the beginning anyway. * Once we're done, we'll log all the dquot bufs. * * The *QUOTA_ON checks below may look pretty racy, but quotachecks * and quotaoffs don't race. (Quotachecks happen at mount time only). */ if (XFS_IS_UQUOTA_ON(mp)) { error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_USER, nblks, rtblks); if (error) goto error0; } if (XFS_IS_GQUOTA_ON(mp)) { error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_GROUP, nblks, rtblks); if (error) goto error0; } if (XFS_IS_PQUOTA_ON(mp)) { error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_PROJ, nblks, rtblks); if (error) goto error0; } xfs_irele(ip); *res = BULKSTAT_RV_DIDONE; return 0; error0: xfs_irele(ip); *res = BULKSTAT_RV_GIVEUP; return error; } STATIC int xfs_qm_flush_one( struct xfs_dquot *dqp, void *data) { struct xfs_mount *mp = dqp->q_mount; struct list_head *buffer_list = data; struct xfs_buf *bp = NULL; int error = 0; xfs_dqlock(dqp); if (dqp->dq_flags & XFS_DQ_FREEING) goto out_unlock; if (!XFS_DQ_IS_DIRTY(dqp)) goto out_unlock; /* * The only way the dquot is already flush locked by the time quotacheck * gets here is if reclaim flushed it before the dqadjust walk dirtied * it for the final time. Quotacheck collects all dquot bufs in the * local delwri queue before dquots are dirtied, so reclaim can't have * possibly queued it for I/O. The only way out is to push the buffer to * cycle the flush lock. */ if (!xfs_dqflock_nowait(dqp)) { /* buf is pinned in-core by delwri list */ bp = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0); if (!bp) { error = -EINVAL; goto out_unlock; } xfs_buf_unlock(bp); xfs_buf_delwri_pushbuf(bp, buffer_list); xfs_buf_rele(bp); error = -EAGAIN; goto out_unlock; } error = xfs_qm_dqflush(dqp, &bp); if (error) goto out_unlock; xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); out_unlock: xfs_dqunlock(dqp); return error; } /* * Walk thru all the filesystem inodes and construct a consistent view * of the disk quota world. If the quotacheck fails, disable quotas. */ STATIC int xfs_qm_quotacheck( xfs_mount_t *mp) { int done, count, error, error2; xfs_ino_t lastino; size_t structsz; uint flags; LIST_HEAD (buffer_list); struct xfs_inode *uip = mp->m_quotainfo->qi_uquotaip; struct xfs_inode *gip = mp->m_quotainfo->qi_gquotaip; struct xfs_inode *pip = mp->m_quotainfo->qi_pquotaip; count = INT_MAX; structsz = 1; lastino = 0; flags = 0; ASSERT(uip || gip || pip); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); xfs_notice(mp, "Quotacheck needed: Please wait."); /* * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset * their counters to zero. We need a clean slate. * We don't log our changes till later. */ if (uip) { error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_QMOPT_UQUOTA, &buffer_list); if (error) goto error_return; flags |= XFS_UQUOTA_CHKD; } if (gip) { error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_QMOPT_GQUOTA, &buffer_list); if (error) goto error_return; flags |= XFS_GQUOTA_CHKD; } if (pip) { error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_QMOPT_PQUOTA, &buffer_list); if (error) goto error_return; flags |= XFS_PQUOTA_CHKD; } do { /* * Iterate thru all the inodes in the file system, * adjusting the corresponding dquot counters in core. */ error = xfs_bulkstat(mp, &lastino, &count, xfs_qm_dqusage_adjust, structsz, NULL, &done); if (error) break; } while (!done); /* * We've made all the changes that we need to make incore. Flush them * down to disk buffers if everything was updated successfully. */ if (XFS_IS_UQUOTA_ON(mp)) { error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one, &buffer_list); } if (XFS_IS_GQUOTA_ON(mp)) { error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one, &buffer_list); if (!error) error = error2; } if (XFS_IS_PQUOTA_ON(mp)) { error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one, &buffer_list); if (!error) error = error2; } error2 = xfs_buf_delwri_submit(&buffer_list); if (!error) error = error2; /* * We can get this error if we couldn't do a dquot allocation inside * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the * dirty dquots that might be cached, we just want to get rid of them * and turn quotaoff. The dquots won't be attached to any of the inodes * at this point (because we intentionally didn't in dqget_noattach). */ if (error) { xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); goto error_return; } /* * If one type of quotas is off, then it will lose its * quotachecked status, since we won't be doing accounting for * that type anymore. */ mp->m_qflags &= ~XFS_ALL_QUOTA_CHKD; mp->m_qflags |= flags; error_return: xfs_buf_delwri_cancel(&buffer_list); if (error) { xfs_warn(mp, "Quotacheck: Unsuccessful (Error %d): Disabling quotas.", error); /* * We must turn off quotas. */ ASSERT(mp->m_quotainfo != NULL); xfs_qm_destroy_quotainfo(mp); if (xfs_mount_reset_sbqflags(mp)) { xfs_warn(mp, "Quotacheck: Failed to reset quota flags."); } } else xfs_notice(mp, "Quotacheck: Done."); return error; } /* * This is called from xfs_mountfs to start quotas and initialize all * necessary data structures like quotainfo. This is also responsible for * running a quotacheck as necessary. We are guaranteed that the superblock * is consistently read in at this point. * * If we fail here, the mount will continue with quota turned off. We don't * need to inidicate success or failure at all. */ void xfs_qm_mount_quotas( struct xfs_mount *mp) { int error = 0; uint sbf; /* * If quotas on realtime volumes is not supported, we disable * quotas immediately. */ if (mp->m_sb.sb_rextents) { xfs_notice(mp, "Cannot turn on quotas for realtime filesystem"); mp->m_qflags = 0; goto write_changes; } ASSERT(XFS_IS_QUOTA_RUNNING(mp)); /* * Allocate the quotainfo structure inside the mount struct, and * create quotainode(s), and change/rev superblock if necessary. */ error = xfs_qm_init_quotainfo(mp); if (error) { /* * We must turn off quotas. */ ASSERT(mp->m_quotainfo == NULL); mp->m_qflags = 0; goto write_changes; } /* * If any of the quotas are not consistent, do a quotacheck. */ if (XFS_QM_NEED_QUOTACHECK(mp)) { error = xfs_qm_quotacheck(mp); if (error) { /* Quotacheck failed and disabled quotas. */ return; } } /* * If one type of quotas is off, then it will lose its * quotachecked status, since we won't be doing accounting for * that type anymore. */ if (!XFS_IS_UQUOTA_ON(mp)) mp->m_qflags &= ~XFS_UQUOTA_CHKD; if (!XFS_IS_GQUOTA_ON(mp)) mp->m_qflags &= ~XFS_GQUOTA_CHKD; if (!XFS_IS_PQUOTA_ON(mp)) mp->m_qflags &= ~XFS_PQUOTA_CHKD; write_changes: /* * We actually don't have to acquire the m_sb_lock at all. * This can only be called from mount, and that's single threaded. XXX */ spin_lock(&mp->m_sb_lock); sbf = mp->m_sb.sb_qflags; mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL; spin_unlock(&mp->m_sb_lock); if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) { if (xfs_sync_sb(mp, false)) { /* * We could only have been turning quotas off. * We aren't in very good shape actually because * the incore structures are convinced that quotas are * off, but the on disk superblock doesn't know that ! */ ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); xfs_alert(mp, "%s: Superblock update failed!", __func__); } } if (error) { xfs_warn(mp, "Failed to initialize disk quotas."); return; } } /* * This is called after the superblock has been read in and we're ready to * iget the quota inodes. */ STATIC int xfs_qm_init_quotainos( xfs_mount_t *mp) { struct xfs_inode *uip = NULL; struct xfs_inode *gip = NULL; struct xfs_inode *pip = NULL; int error; uint flags = 0; ASSERT(mp->m_quotainfo); /* * Get the uquota and gquota inodes */ if (xfs_sb_version_hasquota(&mp->m_sb)) { if (XFS_IS_UQUOTA_ON(mp) && mp->m_sb.sb_uquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_uquotino > 0); error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &uip); if (error) return error; } if (XFS_IS_GQUOTA_ON(mp) && mp->m_sb.sb_gquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_gquotino > 0); error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &gip); if (error) goto error_rele; } if (XFS_IS_PQUOTA_ON(mp) && mp->m_sb.sb_pquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_pquotino > 0); error = xfs_iget(mp, NULL, mp->m_sb.sb_pquotino, 0, 0, &pip); if (error) goto error_rele; } } else { flags |= XFS_QMOPT_SBVERSION; } /* * Create the three inodes, if they don't exist already. The changes * made above will get added to a transaction and logged in one of * the qino_alloc calls below. If the device is readonly, * temporarily switch to read-write to do this. */ if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) { error = xfs_qm_qino_alloc(mp, &uip, flags | XFS_QMOPT_UQUOTA); if (error) goto error_rele; flags &= ~XFS_QMOPT_SBVERSION; } if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) { error = xfs_qm_qino_alloc(mp, &gip, flags | XFS_QMOPT_GQUOTA); if (error) goto error_rele; flags &= ~XFS_QMOPT_SBVERSION; } if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) { error = xfs_qm_qino_alloc(mp, &pip, flags | XFS_QMOPT_PQUOTA); if (error) goto error_rele; } mp->m_quotainfo->qi_uquotaip = uip; mp->m_quotainfo->qi_gquotaip = gip; mp->m_quotainfo->qi_pquotaip = pip; return 0; error_rele: if (uip) xfs_irele(uip); if (gip) xfs_irele(gip); if (pip) xfs_irele(pip); return error; } STATIC void xfs_qm_destroy_quotainos( xfs_quotainfo_t *qi) { if (qi->qi_uquotaip) { xfs_irele(qi->qi_uquotaip); qi->qi_uquotaip = NULL; /* paranoia */ } if (qi->qi_gquotaip) { xfs_irele(qi->qi_gquotaip); qi->qi_gquotaip = NULL; } if (qi->qi_pquotaip) { xfs_irele(qi->qi_pquotaip); qi->qi_pquotaip = NULL; } } STATIC void xfs_qm_dqfree_one( struct xfs_dquot *dqp) { struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; mutex_lock(&qi->qi_tree_lock); radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), be32_to_cpu(dqp->q_core.d_id)); qi->qi_dquots--; mutex_unlock(&qi->qi_tree_lock); xfs_qm_dqdestroy(dqp); } /* --------------- utility functions for vnodeops ---------------- */ /* * Given an inode, a uid, gid and prid make sure that we have * allocated relevant dquot(s) on disk, and that we won't exceed inode * quotas by creating this file. * This also attaches dquot(s) to the given inode after locking it, * and returns the dquots corresponding to the uid and/or gid. * * in : inode (unlocked) * out : udquot, gdquot with references taken and unlocked */ int xfs_qm_vop_dqalloc( struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid, prid_t prid, uint flags, struct xfs_dquot **O_udqpp, struct xfs_dquot **O_gdqpp, struct xfs_dquot **O_pdqpp) { struct xfs_mount *mp = ip->i_mount; struct xfs_dquot *uq = NULL; struct xfs_dquot *gq = NULL; struct xfs_dquot *pq = NULL; int error; uint lockflags; if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return 0; lockflags = XFS_ILOCK_EXCL; xfs_ilock(ip, lockflags); if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip)) gid = ip->i_d.di_gid; /* * Attach the dquot(s) to this inode, doing a dquot allocation * if necessary. The dquot(s) will not be locked. */ if (XFS_NOT_DQATTACHED(mp, ip)) { error = xfs_qm_dqattach_locked(ip, true); if (error) { xfs_iunlock(ip, lockflags); return error; } } if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { if (ip->i_d.di_uid != uid) { /* * What we need is the dquot that has this uid, and * if we send the inode to dqget, the uid of the inode * takes priority over what's sent in the uid argument. * We must unlock inode here before calling dqget if * we're not sending the inode, because otherwise * we'll deadlock by doing trans_reserve while * holding ilock. */ xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, uid, XFS_DQ_USER, true, &uq); if (error) { ASSERT(error != -ENOENT); return error; } /* * Get the ilock in the right order. */ xfs_dqunlock(uq); lockflags = XFS_ILOCK_SHARED; xfs_ilock(ip, lockflags); } else { /* * Take an extra reference, because we'll return * this to caller */ ASSERT(ip->i_udquot); uq = xfs_qm_dqhold(ip->i_udquot); } } if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { if (ip->i_d.di_gid != gid) { xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, gid, XFS_DQ_GROUP, true, &gq); if (error) { ASSERT(error != -ENOENT); goto error_rele; } xfs_dqunlock(gq); lockflags = XFS_ILOCK_SHARED; xfs_ilock(ip, lockflags); } else { ASSERT(ip->i_gdquot); gq = xfs_qm_dqhold(ip->i_gdquot); } } if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { if (xfs_get_projid(ip) != prid) { xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ, true, &pq); if (error) { ASSERT(error != -ENOENT); goto error_rele; } xfs_dqunlock(pq); lockflags = XFS_ILOCK_SHARED; xfs_ilock(ip, lockflags); } else { ASSERT(ip->i_pdquot); pq = xfs_qm_dqhold(ip->i_pdquot); } } if (uq) trace_xfs_dquot_dqalloc(ip); xfs_iunlock(ip, lockflags); if (O_udqpp) *O_udqpp = uq; else xfs_qm_dqrele(uq); if (O_gdqpp) *O_gdqpp = gq; else xfs_qm_dqrele(gq); if (O_pdqpp) *O_pdqpp = pq; else xfs_qm_dqrele(pq); return 0; error_rele: xfs_qm_dqrele(gq); xfs_qm_dqrele(uq); return error; } /* * Actually transfer ownership, and do dquot modifications. * These were already reserved. */ xfs_dquot_t * xfs_qm_vop_chown( xfs_trans_t *tp, xfs_inode_t *ip, xfs_dquot_t **IO_olddq, xfs_dquot_t *newdq) { xfs_dquot_t *prevdq; uint bfield = XFS_IS_REALTIME_INODE(ip) ? XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); /* old dquot */ prevdq = *IO_olddq; ASSERT(prevdq); ASSERT(prevdq != newdq); xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_d.di_nblocks)); xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1); /* the sparkling new dquot */ xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks); xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1); /* * Take an extra reference, because the inode is going to keep * this dquot pointer even after the trans_commit. */ *IO_olddq = xfs_qm_dqhold(newdq); return prevdq; } /* * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID). */ int xfs_qm_vop_chown_reserve( struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, uint flags) { struct xfs_mount *mp = ip->i_mount; uint delblks, blkflags, prjflags = 0; struct xfs_dquot *udq_unres = NULL; struct xfs_dquot *gdq_unres = NULL; struct xfs_dquot *pdq_unres = NULL; struct xfs_dquot *udq_delblks = NULL; struct xfs_dquot *gdq_delblks = NULL; struct xfs_dquot *pdq_delblks = NULL; int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); delblks = ip->i_delayed_blks; blkflags = XFS_IS_REALTIME_INODE(ip) ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; if (XFS_IS_UQUOTA_ON(mp) && udqp && ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) { udq_delblks = udqp; /* * If there are delayed allocation blocks, then we have to * unreserve those from the old dquot, and add them to the * new dquot. */ if (delblks) { ASSERT(ip->i_udquot); udq_unres = ip->i_udquot; } } if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp && ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) { gdq_delblks = gdqp; if (delblks) { ASSERT(ip->i_gdquot); gdq_unres = ip->i_gdquot; } } if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp && xfs_get_projid(ip) != be32_to_cpu(pdqp->q_core.d_id)) { prjflags = XFS_QMOPT_ENOSPC; pdq_delblks = pdqp; if (delblks) { ASSERT(ip->i_pdquot); pdq_unres = ip->i_pdquot; } } error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, udq_delblks, gdq_delblks, pdq_delblks, ip->i_d.di_nblocks, 1, flags | blkflags | prjflags); if (error) return error; /* * Do the delayed blks reservations/unreservations now. Since, these * are done without the help of a transaction, if a reservation fails * its previous reservations won't be automatically undone by trans * code. So, we have to do it manually here. */ if (delblks) { /* * Do the reservations first. Unreservation can't fail. */ ASSERT(udq_delblks || gdq_delblks || pdq_delblks); ASSERT(udq_unres || gdq_unres || pdq_unres); error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, udq_delblks, gdq_delblks, pdq_delblks, (xfs_qcnt_t)delblks, 0, flags | blkflags | prjflags); if (error) return error; xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, udq_unres, gdq_unres, pdq_unres, -((xfs_qcnt_t)delblks), 0, blkflags); } return 0; } int xfs_qm_vop_rename_dqattach( struct xfs_inode **i_tab) { struct xfs_mount *mp = i_tab[0]->i_mount; int i; if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return 0; for (i = 0; (i < 4 && i_tab[i]); i++) { struct xfs_inode *ip = i_tab[i]; int error; /* * Watch out for duplicate entries in the table. */ if (i == 0 || ip != i_tab[i-1]) { if (XFS_NOT_DQATTACHED(mp, ip)) { error = xfs_qm_dqattach(ip); if (error) return error; } } } return 0; } void xfs_qm_vop_create_dqattach( struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp) { struct xfs_mount *mp = tp->t_mountp; if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(XFS_IS_QUOTA_RUNNING(mp)); if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); ip->i_udquot = xfs_qm_dqhold(udqp); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } if (gdqp && XFS_IS_GQUOTA_ON(mp)) { ASSERT(ip->i_gdquot == NULL); ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id)); ip->i_gdquot = xfs_qm_dqhold(gdqp); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } if (pdqp && XFS_IS_PQUOTA_ON(mp)) { ASSERT(ip->i_pdquot == NULL); ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id)); ip->i_pdquot = xfs_qm_dqhold(pdqp); xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1); } } |
4 4 4 1 1 1 2 2 2 1 1 5 5 5 5 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 | /* * Media entity * * Copyright (C) 2010 Nokia Corporation * * Contacts: Laurent Pinchart <laurent.pinchart@ideasonboard.com> * Sakari Ailus <sakari.ailus@iki.fi> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include <linux/bitmap.h> #include <linux/module.h> #include <linux/property.h> #include <linux/slab.h> #include <media/media-entity.h> #include <media/media-device.h> static inline const char *gobj_type(enum media_gobj_type type) { switch (type) { case MEDIA_GRAPH_ENTITY: return "entity"; case MEDIA_GRAPH_PAD: return "pad"; case MEDIA_GRAPH_LINK: return "link"; case MEDIA_GRAPH_INTF_DEVNODE: return "intf-devnode"; default: return "unknown"; } } static inline const char *intf_type(struct media_interface *intf) { switch (intf->type) { case MEDIA_INTF_T_DVB_FE: return "dvb-frontend"; case MEDIA_INTF_T_DVB_DEMUX: return "dvb-demux"; case MEDIA_INTF_T_DVB_DVR: return "dvb-dvr"; case MEDIA_INTF_T_DVB_CA: return "dvb-ca"; case MEDIA_INTF_T_DVB_NET: return "dvb-net"; case MEDIA_INTF_T_V4L_VIDEO: return "v4l-video"; case MEDIA_INTF_T_V4L_VBI: return "v4l-vbi"; case MEDIA_INTF_T_V4L_RADIO: return "v4l-radio"; case MEDIA_INTF_T_V4L_SUBDEV: return "v4l-subdev"; case MEDIA_INTF_T_V4L_SWRADIO: return "v4l-swradio"; case MEDIA_INTF_T_V4L_TOUCH: return "v4l-touch"; default: return "unknown-intf"; } }; __must_check int __media_entity_enum_init(struct media_entity_enum *ent_enum, int idx_max) { idx_max = ALIGN(idx_max, BITS_PER_LONG); ent_enum->bmap = kcalloc(idx_max / BITS_PER_LONG, sizeof(long), GFP_KERNEL); if (!ent_enum->bmap) return -ENOMEM; bitmap_zero(ent_enum->bmap, idx_max); ent_enum->idx_max = idx_max; return 0; } EXPORT_SYMBOL_GPL(__media_entity_enum_init); void media_entity_enum_cleanup(struct media_entity_enum *ent_enum) { kfree(ent_enum->bmap); } EXPORT_SYMBOL_GPL(media_entity_enum_cleanup); /** * dev_dbg_obj - Prints in debug mode a change on some object * * @event_name: Name of the event to report. Could be __func__ * @gobj: Pointer to the object * * Enabled only if DEBUG or CONFIG_DYNAMIC_DEBUG. Otherwise, it * won't produce any code. */ static void dev_dbg_obj(const char *event_name, struct media_gobj *gobj) { #if defined(DEBUG) || defined (CONFIG_DYNAMIC_DEBUG) switch (media_type(gobj)) { case MEDIA_GRAPH_ENTITY: dev_dbg(gobj->mdev->dev, "%s id %u: entity '%s'\n", event_name, media_id(gobj), gobj_to_entity(gobj)->name); break; case MEDIA_GRAPH_LINK: { struct media_link *link = gobj_to_link(gobj); dev_dbg(gobj->mdev->dev, "%s id %u: %s link id %u ==> id %u\n", event_name, media_id(gobj), media_type(link->gobj0) == MEDIA_GRAPH_PAD ? "data" : "interface", media_id(link->gobj0), media_id(link->gobj1)); break; } case MEDIA_GRAPH_PAD: { struct media_pad *pad = gobj_to_pad(gobj); dev_dbg(gobj->mdev->dev, "%s id %u: %s%spad '%s':%d\n", event_name, media_id(gobj), pad->flags & MEDIA_PAD_FL_SINK ? "sink " : "", pad->flags & MEDIA_PAD_FL_SOURCE ? "source " : "", pad->entity->name, pad->index); break; } case MEDIA_GRAPH_INTF_DEVNODE: { struct media_interface *intf = gobj_to_intf(gobj); struct media_intf_devnode *devnode = intf_to_devnode(intf); dev_dbg(gobj->mdev->dev, "%s id %u: intf_devnode %s - major: %d, minor: %d\n", event_name, media_id(gobj), intf_type(intf), devnode->major, devnode->minor); break; } } #endif } void media_gobj_create(struct media_device *mdev, enum media_gobj_type type, struct media_gobj *gobj) { BUG_ON(!mdev); gobj->mdev = mdev; /* Create a per-type unique object ID */ gobj->id = media_gobj_gen_id(type, ++mdev->id); switch (type) { case MEDIA_GRAPH_ENTITY: list_add_tail(&gobj->list, &mdev->entities); break; case MEDIA_GRAPH_PAD: list_add_tail(&gobj->list, &mdev->pads); break; case MEDIA_GRAPH_LINK: list_add_tail(&gobj->list, &mdev->links); break; case MEDIA_GRAPH_INTF_DEVNODE: list_add_tail(&gobj->list, &mdev->interfaces); break; } mdev->topology_version++; dev_dbg_obj(__func__, gobj); } void media_gobj_destroy(struct media_gobj *gobj) { /* Do nothing if the object is not linked. */ if (gobj->mdev == NULL) return; dev_dbg_obj(__func__, gobj); gobj->mdev->topology_version++; /* Remove the object from mdev list */ list_del(&gobj->list); gobj->mdev = NULL; } /* * TODO: Get rid of this. */ #define MEDIA_ENTITY_MAX_PADS 512 int media_entity_pads_init(struct media_entity *entity, u16 num_pads, struct media_pad *pads) { struct media_device *mdev = entity->graph_obj.mdev; unsigned int i; if (num_pads >= MEDIA_ENTITY_MAX_PADS) return -E2BIG; entity->num_pads = num_pads; entity->pads = pads; if (mdev) mutex_lock(&mdev->graph_mutex); for (i = 0; i < num_pads; i++) { pads[i].entity = entity; pads[i].index = i; if (mdev) media_gobj_create(mdev, MEDIA_GRAPH_PAD, &entity->pads[i].graph_obj); } if (mdev) mutex_unlock(&mdev->graph_mutex); return 0; } EXPORT_SYMBOL_GPL(media_entity_pads_init); /* ----------------------------------------------------------------------------- * Graph traversal */ static struct media_entity * media_entity_other(struct media_entity *entity, struct media_link *link) { if (link->source->entity == entity) return link->sink->entity; else return link->source->entity; } /* push an entity to traversal stack */ static void stack_push(struct media_graph *graph, struct media_entity *entity) { if (graph->top == MEDIA_ENTITY_ENUM_MAX_DEPTH - 1) { WARN_ON(1); return; } graph->top++; graph->stack[graph->top].link = entity->links.next; graph->stack[graph->top].entity = entity; } static struct media_entity *stack_pop(struct media_graph *graph) { struct media_entity *entity; entity = graph->stack[graph->top].entity; graph->top--; return entity; } #define link_top(en) ((en)->stack[(en)->top].link) #define stack_top(en) ((en)->stack[(en)->top].entity) /** * media_graph_walk_init - Allocate resources for graph walk * @graph: Media graph structure that will be used to walk the graph * @mdev: Media device * * Reserve resources for graph walk in media device's current * state. The memory must be released using * media_graph_walk_free(). * * Returns error on failure, zero on success. */ __must_check int media_graph_walk_init( struct media_graph *graph, struct media_device *mdev) { return media_entity_enum_init(&graph->ent_enum, mdev); } EXPORT_SYMBOL_GPL(media_graph_walk_init); /** * media_graph_walk_cleanup - Release resources related to graph walking * @graph: Media graph structure that was used to walk the graph */ void media_graph_walk_cleanup(struct media_graph *graph) { media_entity_enum_cleanup(&graph->ent_enum); } EXPORT_SYMBOL_GPL(media_graph_walk_cleanup); void media_graph_walk_start(struct media_graph *graph, struct media_entity *entity) { media_entity_enum_zero(&graph->ent_enum); media_entity_enum_set(&graph->ent_enum, entity); graph->top = 0; graph->stack[graph->top].entity = NULL; stack_push(graph, entity); dev_dbg(entity->graph_obj.mdev->dev, "begin graph walk at '%s'\n", entity->name); } EXPORT_SYMBOL_GPL(media_graph_walk_start); static void media_graph_walk_iter(struct media_graph *graph) { struct media_entity *entity = stack_top(graph); struct media_link *link; struct media_entity *next; link = list_entry(link_top(graph), typeof(*link), list); /* The link is not enabled so we do not follow. */ if (!(link->flags & MEDIA_LNK_FL_ENABLED)) { link_top(graph) = link_top(graph)->next; dev_dbg(entity->graph_obj.mdev->dev, "walk: skipping disabled link '%s':%u -> '%s':%u\n", link->source->entity->name, link->source->index, link->sink->entity->name, link->sink->index); return; } /* Get the entity in the other end of the link . */ next = media_entity_other(entity, link); /* Has the entity already been visited? */ if (media_entity_enum_test_and_set(&graph->ent_enum, next)) { link_top(graph) = link_top(graph)->next; dev_dbg(entity->graph_obj.mdev->dev, "walk: skipping entity '%s' (already seen)\n", next->name); return; } /* Push the new entity to stack and start over. */ link_top(graph) = link_top(graph)->next; stack_push(graph, next); dev_dbg(entity->graph_obj.mdev->dev, "walk: pushing '%s' on stack\n", next->name); } struct media_entity *media_graph_walk_next(struct media_graph *graph) { struct media_entity *entity; if (stack_top(graph) == NULL) return NULL; /* * Depth first search. Push entity to stack and continue from * top of the stack until no more entities on the level can be * found. */ while (link_top(graph) != &stack_top(graph)->links) media_graph_walk_iter(graph); entity = stack_pop(graph); dev_dbg(entity->graph_obj.mdev->dev, "walk: returning entity '%s'\n", entity->name); return entity; } EXPORT_SYMBOL_GPL(media_graph_walk_next); int media_entity_get_fwnode_pad(struct media_entity *entity, struct fwnode_handle *fwnode, unsigned long direction_flags) { struct fwnode_endpoint endpoint; unsigned int i; int ret; if (!entity->ops || !entity->ops->get_fwnode_pad) { for (i = 0; i < entity->num_pads; i++) { if (entity->pads[i].flags & direction_flags) return i; } return -ENXIO; } ret = fwnode_graph_parse_endpoint(fwnode, &endpoint); if (ret) return ret; ret = entity->ops->get_fwnode_pad(&endpoint); if (ret < 0) return ret; if (ret >= entity->num_pads) return -ENXIO; if (!(entity->pads[ret].flags & direction_flags)) return -ENXIO; return ret; } EXPORT_SYMBOL_GPL(media_entity_get_fwnode_pad); /* ----------------------------------------------------------------------------- * Pipeline management */ __must_check int __media_pipeline_start(struct media_entity *entity, struct media_pipeline *pipe) { struct media_device *mdev = entity->graph_obj.mdev; struct media_graph *graph = &pipe->graph; struct media_entity *entity_err = entity; struct media_link *link; int ret; if (!pipe->streaming_count++) { ret = media_graph_walk_init(&pipe->graph, mdev); if (ret) goto error_graph_walk_start; } media_graph_walk_start(&pipe->graph, entity); while ((entity = media_graph_walk_next(graph))) { DECLARE_BITMAP(active, MEDIA_ENTITY_MAX_PADS); DECLARE_BITMAP(has_no_links, MEDIA_ENTITY_MAX_PADS); entity->stream_count++; if (WARN_ON(entity->pipe && entity->pipe != pipe)) { ret = -EBUSY; goto error; } entity->pipe = pipe; /* Already streaming --- no need to check. */ if (entity->stream_count > 1) continue; if (!entity->ops || !entity->ops->link_validate) continue; bitmap_zero(active, entity->num_pads); bitmap_fill(has_no_links, entity->num_pads); list_for_each_entry(link, &entity->links, list) { struct media_pad *pad = link->sink->entity == entity ? link->sink : link->source; /* Mark that a pad is connected by a link. */ bitmap_clear(has_no_links, pad->index, 1); /* * Pads that either do not need to connect or * are connected through an enabled link are * fine. */ if (!(pad->flags & MEDIA_PAD_FL_MUST_CONNECT) || link->flags & MEDIA_LNK_FL_ENABLED) bitmap_set(active, pad->index, 1); /* * Link validation will only take place for * sink ends of the link that are enabled. */ if (link->sink != pad || !(link->flags & MEDIA_LNK_FL_ENABLED)) continue; ret = entity->ops->link_validate(link); if (ret < 0 && ret != -ENOIOCTLCMD) { dev_dbg(entity->graph_obj.mdev->dev, "link validation failed for '%s':%u -> '%s':%u, error %d\n", link->source->entity->name, link->source->index, entity->name, link->sink->index, ret); goto error; } } /* Either no links or validated links are fine. */ bitmap_or(active, active, has_no_links, entity->num_pads); if (!bitmap_full(active, entity->num_pads)) { ret = -ENOLINK; dev_dbg(entity->graph_obj.mdev->dev, "'%s':%u must be connected by an enabled link\n", entity->name, (unsigned)find_first_zero_bit( active, entity->num_pads)); goto error; } } return 0; error: /* * Link validation on graph failed. We revert what we did and * return the error. */ media_graph_walk_start(graph, entity_err); while ((entity_err = media_graph_walk_next(graph))) { /* Sanity check for negative stream_count */ if (!WARN_ON_ONCE(entity_err->stream_count <= 0)) { entity_err->stream_count--; if (entity_err->stream_count == 0) entity_err->pipe = NULL; } /* * We haven't increased stream_count further than this * so we quit here. */ if (entity_err == entity) break; } error_graph_walk_start: if (!--pipe->streaming_count) media_graph_walk_cleanup(graph); return ret; } EXPORT_SYMBOL_GPL(__media_pipeline_start); __must_check int media_pipeline_start(struct media_entity *entity, struct media_pipeline *pipe) { struct media_device *mdev = entity->graph_obj.mdev; int ret; mutex_lock(&mdev->graph_mutex); ret = __media_pipeline_start(entity, pipe); mutex_unlock(&mdev->graph_mutex); return ret; } EXPORT_SYMBOL_GPL(media_pipeline_start); void __media_pipeline_stop(struct media_entity *entity) { struct media_graph *graph = &entity->pipe->graph; struct media_pipeline *pipe = entity->pipe; /* * If the following check fails, the driver has performed an * unbalanced call to media_pipeline_stop() */ if (WARN_ON(!pipe)) return; media_graph_walk_start(graph, entity); while ((entity = media_graph_walk_next(graph))) { /* Sanity check for negative stream_count */ if (!WARN_ON_ONCE(entity->stream_count <= 0)) { entity->stream_count--; if (entity->stream_count == 0) entity->pipe = NULL; } } if (!--pipe->streaming_count) media_graph_walk_cleanup(graph); } EXPORT_SYMBOL_GPL(__media_pipeline_stop); void media_pipeline_stop(struct media_entity *entity) { struct media_device *mdev = entity->graph_obj.mdev; mutex_lock(&mdev->graph_mutex); __media_pipeline_stop(entity); mutex_unlock(&mdev->graph_mutex); } EXPORT_SYMBOL_GPL(media_pipeline_stop); /* ----------------------------------------------------------------------------- * Module use count */ struct media_entity *media_entity_get(struct media_entity *entity) { if (entity == NULL) return NULL; if (entity->graph_obj.mdev->dev && !try_module_get(entity->graph_obj.mdev->dev->driver->owner)) return NULL; return entity; } EXPORT_SYMBOL_GPL(media_entity_get); void media_entity_put(struct media_entity *entity) { if (entity == NULL) return; if (entity->graph_obj.mdev->dev) module_put(entity->graph_obj.mdev->dev->driver->owner); } EXPORT_SYMBOL_GPL(media_entity_put); /* ----------------------------------------------------------------------------- * Links management */ static struct media_link *media_add_link(struct list_head *head) { struct media_link *link; link = kzalloc(sizeof(*link), GFP_KERNEL); if (link == NULL) return NULL; list_add_tail(&link->list, head); return link; } static void __media_entity_remove_link(struct media_entity *entity, struct media_link *link) { struct media_link *rlink, *tmp; struct media_entity *remote; if (link->source->entity == entity) remote = link->sink->entity; else remote = link->source->entity; list_for_each_entry_safe(rlink, tmp, &remote->links, list) { if (rlink != link->reverse) continue; if (link->source->entity == entity) remote->num_backlinks--; /* Remove the remote link */ list_del(&rlink->list); media_gobj_destroy(&rlink->graph_obj); kfree(rlink); if (--remote->num_links == 0) break; } list_del(&link->list); media_gobj_destroy(&link->graph_obj); kfree(link); } int media_create_pad_link(struct media_entity *source, u16 source_pad, struct media_entity *sink, u16 sink_pad, u32 flags) { struct media_link *link; struct media_link *backlink; BUG_ON(source == NULL || sink == NULL); BUG_ON(source_pad >= source->num_pads); BUG_ON(sink_pad >= sink->num_pads); link = media_add_link(&source->links); if (link == NULL) return -ENOMEM; link->source = &source->pads[source_pad]; link->sink = &sink->pads[sink_pad]; link->flags = flags & ~MEDIA_LNK_FL_INTERFACE_LINK; /* Initialize graph object embedded at the new link */ media_gobj_create(source->graph_obj.mdev, MEDIA_GRAPH_LINK, &link->graph_obj); /* Create the backlink. Backlinks are used to help graph traversal and * are not reported to userspace. */ backlink = media_add_link(&sink->links); if (backlink == NULL) { __media_entity_remove_link(source, link); return -ENOMEM; } backlink->source = &source->pads[source_pad]; backlink->sink = &sink->pads[sink_pad]; backlink->flags = flags; backlink->is_backlink = true; /* Initialize graph object embedded at the new link */ media_gobj_create(sink->graph_obj.mdev, MEDIA_GRAPH_LINK, &backlink->graph_obj); link->reverse = backlink; backlink->reverse = link; sink->num_backlinks++; sink->num_links++; source->num_links++; return 0; } EXPORT_SYMBOL_GPL(media_create_pad_link); int media_create_pad_links(const struct media_device *mdev, const u32 source_function, struct media_entity *source, const u16 source_pad, const u32 sink_function, struct media_entity *sink, const u16 sink_pad, u32 flags, const bool allow_both_undefined) { struct media_entity *entity; unsigned function; int ret; /* Trivial case: 1:1 relation */ if (source && sink) return media_create_pad_link(source, source_pad, sink, sink_pad, flags); /* Worse case scenario: n:n relation */ if (!source && !sink) { if (!allow_both_undefined) return 0; media_device_for_each_entity(source, mdev) { if (source->function != source_function) continue; media_device_for_each_entity(sink, mdev) { if (sink->function != sink_function) continue; ret = media_create_pad_link(source, source_pad, sink, sink_pad, flags); if (ret) return ret; flags &= ~(MEDIA_LNK_FL_ENABLED | MEDIA_LNK_FL_IMMUTABLE); } } return 0; } /* Handle 1:n and n:1 cases */ if (source) function = sink_function; else function = source_function; media_device_for_each_entity(entity, mdev) { if (entity->function != function) continue; if (source) ret = media_create_pad_link(source, source_pad, entity, sink_pad, flags); else ret = media_create_pad_link(entity, source_pad, sink, sink_pad, flags); if (ret) return ret; flags &= ~(MEDIA_LNK_FL_ENABLED | MEDIA_LNK_FL_IMMUTABLE); } return 0; } EXPORT_SYMBOL_GPL(media_create_pad_links); void __media_entity_remove_links(struct media_entity *entity) { struct media_link *link, *tmp; list_for_each_entry_safe(link, tmp, &entity->links, list) __media_entity_remove_link(entity, link); entity->num_links = 0; entity->num_backlinks = 0; } EXPORT_SYMBOL_GPL(__media_entity_remove_links); void media_entity_remove_links(struct media_entity *entity) { struct media_device *mdev = entity->graph_obj.mdev; /* Do nothing if the entity is not registered. */ if (mdev == NULL) return; mutex_lock(&mdev->graph_mutex); __media_entity_remove_links(entity); mutex_unlock(&mdev->graph_mutex); } EXPORT_SYMBOL_GPL(media_entity_remove_links); static int __media_entity_setup_link_notify(struct media_link *link, u32 flags) { int ret; /* Notify both entities. */ ret = media_entity_call(link->source->entity, link_setup, link->source, link->sink, flags); if (ret < 0 && ret != -ENOIOCTLCMD) return ret; ret = media_entity_call(link->sink->entity, link_setup, link->sink, link->source, flags); if (ret < 0 && ret != -ENOIOCTLCMD) { media_entity_call(link->source->entity, link_setup, link->source, link->sink, link->flags); return ret; } link->flags = flags; link->reverse->flags = link->flags; return 0; } int __media_entity_setup_link(struct media_link *link, u32 flags) { const u32 mask = MEDIA_LNK_FL_ENABLED; struct media_device *mdev; struct media_entity *source, *sink; int ret = -EBUSY; if (link == NULL) return -EINVAL; /* The non-modifiable link flags must not be modified. */ if ((link->flags & ~mask) != (flags & ~mask)) return -EINVAL; if (link->flags & MEDIA_LNK_FL_IMMUTABLE) return link->flags == flags ? 0 : -EINVAL; if (link->flags == flags) return 0; source = link->source->entity; sink = link->sink->entity; if (!(link->flags & MEDIA_LNK_FL_DYNAMIC) && (source->stream_count || sink->stream_count)) return -EBUSY; mdev = source->graph_obj.mdev; if (mdev->ops && mdev->ops->link_notify) { ret = mdev->ops->link_notify(link, flags, MEDIA_DEV_NOTIFY_PRE_LINK_CH); if (ret < 0) return ret; } ret = __media_entity_setup_link_notify(link, flags); if (mdev->ops && mdev->ops->link_notify) mdev->ops->link_notify(link, flags, MEDIA_DEV_NOTIFY_POST_LINK_CH); return ret; } EXPORT_SYMBOL_GPL(__media_entity_setup_link); int media_entity_setup_link(struct media_link *link, u32 flags) { int ret; mutex_lock(&link->graph_obj.mdev->graph_mutex); ret = __media_entity_setup_link(link, flags); mutex_unlock(&link->graph_obj.mdev->graph_mutex); return ret; } EXPORT_SYMBOL_GPL(media_entity_setup_link); struct media_link * media_entity_find_link(struct media_pad *source, struct media_pad *sink) { struct media_link *link; list_for_each_entry(link, &source->entity->links, list) { if (link->source->entity == source->entity && link->source->index == source->index && link->sink->entity == sink->entity && link->sink->index == sink->index) return link; } return NULL; } EXPORT_SYMBOL_GPL(media_entity_find_link); struct media_pad *media_entity_remote_pad(const struct media_pad *pad) { struct media_link *link; list_for_each_entry(link, &pad->entity->links, list) { if (!(link->flags & MEDIA_LNK_FL_ENABLED)) continue; if (link->source == pad) return link->sink; if (link->sink == pad) return link->source; } return NULL; } EXPORT_SYMBOL_GPL(media_entity_remote_pad); static void media_interface_init(struct media_device *mdev, struct media_interface *intf, u32 gobj_type, u32 intf_type, u32 flags) { intf->type = intf_type; intf->flags = flags; INIT_LIST_HEAD(&intf->links); media_gobj_create(mdev, gobj_type, &intf->graph_obj); } /* Functions related to the media interface via device nodes */ struct media_intf_devnode *media_devnode_create(struct media_device *mdev, u32 type, u32 flags, u32 major, u32 minor) { struct media_intf_devnode *devnode; devnode = kzalloc(sizeof(*devnode), GFP_KERNEL); if (!devnode) return NULL; devnode->major = major; devnode->minor = minor; media_interface_init(mdev, &devnode->intf, MEDIA_GRAPH_INTF_DEVNODE, type, flags); return devnode; } EXPORT_SYMBOL_GPL(media_devnode_create); void media_devnode_remove(struct media_intf_devnode *devnode) { media_remove_intf_links(&devnode->intf); media_gobj_destroy(&devnode->intf.graph_obj); kfree(devnode); } EXPORT_SYMBOL_GPL(media_devnode_remove); struct media_link *media_create_intf_link(struct media_entity *entity, struct media_interface *intf, u32 flags) { struct media_link *link; link = media_add_link(&intf->links); if (link == NULL) return NULL; link->intf = intf; link->entity = entity; link->flags = flags | MEDIA_LNK_FL_INTERFACE_LINK; /* Initialize graph object embedded at the new link */ media_gobj_create(intf->graph_obj.mdev, MEDIA_GRAPH_LINK, &link->graph_obj); return link; } EXPORT_SYMBOL_GPL(media_create_intf_link); void __media_remove_intf_link(struct media_link *link) { list_del(&link->list); media_gobj_destroy(&link->graph_obj); kfree(link); } EXPORT_SYMBOL_GPL(__media_remove_intf_link); void media_remove_intf_link(struct media_link *link) { struct media_device *mdev = link->graph_obj.mdev; /* Do nothing if the intf is not registered. */ if (mdev == NULL) return; mutex_lock(&mdev->graph_mutex); __media_remove_intf_link(link); mutex_unlock(&mdev->graph_mutex); } EXPORT_SYMBOL_GPL(media_remove_intf_link); void __media_remove_intf_links(struct media_interface *intf) { struct media_link *link, *tmp; list_for_each_entry_safe(link, tmp, &intf->links, list) __media_remove_intf_link(link); } EXPORT_SYMBOL_GPL(__media_remove_intf_links); void media_remove_intf_links(struct media_interface *intf) { struct media_device *mdev = intf->graph_obj.mdev; /* Do nothing if the intf is not registered. */ if (mdev == NULL) return; mutex_lock(&mdev->graph_mutex); __media_remove_intf_links(intf); mutex_unlock(&mdev->graph_mutex); } EXPORT_SYMBOL_GPL(media_remove_intf_links); |
10 7 6 4 2 2 2 4 10 5 4 2 626 625 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 | /* * Stateless NAT actions * * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. */ #include <linux/errno.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/netfilter.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/tc_act/tc_nat.h> #include <net/act_api.h> #include <net/icmp.h> #include <net/ip.h> #include <net/netlink.h> #include <net/tc_act/tc_nat.h> #include <net/tcp.h> #include <net/udp.h> static unsigned int nat_net_id; static struct tc_action_ops act_nat_ops; static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { [TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) }, }; static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); struct nlattr *tb[TCA_NAT_MAX + 1]; struct tc_nat *parm; int ret = 0, err; struct tcf_nat *p; u32 index; if (nla == NULL) return -EINVAL; err = nla_parse_nested(tb, TCA_NAT_MAX, nla, nat_policy, NULL); if (err < 0) return err; if (tb[TCA_NAT_PARMS] == NULL) return -EINVAL; parm = nla_data(tb[TCA_NAT_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create(tn, index, est, a, &act_nat_ops, bind, false); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (err > 0) { if (bind) return 0; if (!ovr) { tcf_idr_release(*a, bind); return -EEXIST; } } else { return err; } p = to_tcf_nat(*a); spin_lock_bh(&p->tcf_lock); p->old_addr = parm->old_addr; p->new_addr = parm->new_addr; p->mask = parm->mask; p->flags = parm->flags; p->tcf_action = parm->action; spin_unlock_bh(&p->tcf_lock); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; } static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_nat *p = to_tcf_nat(a); struct iphdr *iph; __be32 old_addr; __be32 new_addr; __be32 mask; __be32 addr; int egress; int action; int ihl; int noff; spin_lock(&p->tcf_lock); tcf_lastuse_update(&p->tcf_tm); old_addr = p->old_addr; new_addr = p->new_addr; mask = p->mask; egress = p->flags & TCA_NAT_FLAG_EGRESS; action = p->tcf_action; bstats_update(&p->tcf_bstats, skb); spin_unlock(&p->tcf_lock); if (unlikely(action == TC_ACT_SHOT)) goto drop; noff = skb_network_offset(skb); if (!pskb_may_pull(skb, sizeof(*iph) + noff)) goto drop; iph = ip_hdr(skb); if (egress) addr = iph->saddr; else addr = iph->daddr; if (!((old_addr ^ addr) & mask)) { if (skb_try_make_writable(skb, sizeof(*iph) + noff)) goto drop; new_addr &= mask; new_addr |= addr & ~mask; /* Rewrite IP header */ iph = ip_hdr(skb); if (egress) iph->saddr = new_addr; else iph->daddr = new_addr; csum_replace4(&iph->check, addr, new_addr); } else if ((iph->frag_off & htons(IP_OFFSET)) || iph->protocol != IPPROTO_ICMP) { goto out; } ihl = iph->ihl * 4; /* It would be nice to share code with stateful NAT. */ switch (iph->frag_off & htons(IP_OFFSET) ? 0 : iph->protocol) { case IPPROTO_TCP: { struct tcphdr *tcph; if (!pskb_may_pull(skb, ihl + sizeof(*tcph) + noff) || skb_try_make_writable(skb, ihl + sizeof(*tcph) + noff)) goto drop; tcph = (void *)(skb_network_header(skb) + ihl); inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true); break; } case IPPROTO_UDP: { struct udphdr *udph; if (!pskb_may_pull(skb, ihl + sizeof(*udph) + noff) || skb_try_make_writable(skb, ihl + sizeof(*udph) + noff)) goto drop; udph = (void *)(skb_network_header(skb) + ihl); if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace4(&udph->check, skb, addr, new_addr, true); if (!udph->check) udph->check = CSUM_MANGLED_0; } break; } case IPPROTO_ICMP: { struct icmphdr *icmph; if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + noff)) goto drop; icmph = (void *)(skb_network_header(skb) + ihl); if ((icmph->type != ICMP_DEST_UNREACH) && (icmph->type != ICMP_TIME_EXCEEDED) && (icmph->type != ICMP_PARAMETERPROB)) break; if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph) + noff)) goto drop; icmph = (void *)(skb_network_header(skb) + ihl); iph = (void *)(icmph + 1); if (egress) addr = iph->daddr; else addr = iph->saddr; if ((old_addr ^ addr) & mask) break; if (skb_try_make_writable(skb, ihl + sizeof(*icmph) + sizeof(*iph) + noff)) goto drop; icmph = (void *)(skb_network_header(skb) + ihl); iph = (void *)(icmph + 1); new_addr &= mask; new_addr |= addr & ~mask; /* XXX Fix up the inner checksums. */ if (egress) iph->daddr = new_addr; else iph->saddr = new_addr; inet_proto_csum_replace4(&icmph->checksum, skb, addr, new_addr, false); break; } default: break; } out: return action; drop: spin_lock(&p->tcf_lock); p->tcf_qstats.drops++; spin_unlock(&p->tcf_lock); return TC_ACT_SHOT; } static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_nat *p = to_tcf_nat(a); struct tc_nat opt = { .old_addr = p->old_addr, .new_addr = p->new_addr, .mask = p->mask, .flags = p->flags, .index = p->tcf_index, .action = p->tcf_action, .refcnt = refcount_read(&p->tcf_refcnt) - ref, .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, }; struct tcf_t t; if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD)) goto nla_put_failure; return skb->len; nla_put_failure: nlmsg_trim(skb, b); return -1; } static int tcf_nat_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_nat_ops = { .kind = "nat", .type = TCA_ACT_NAT, .owner = THIS_MODULE, .act = tcf_nat_act, .dump = tcf_nat_dump, .init = tcf_nat_init, .walk = tcf_nat_walker, .lookup = tcf_nat_search, .size = sizeof(struct tcf_nat), }; static __net_init int nat_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, nat_net_id); return tc_action_net_init(net, tn, &act_nat_ops); } static void __net_exit nat_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, nat_net_id); } static struct pernet_operations nat_net_ops = { .init = nat_init_net, .exit_batch = nat_exit_net, .id = &nat_net_id, .size = sizeof(struct tc_action_net), }; MODULE_DESCRIPTION("Stateless NAT actions"); MODULE_LICENSE("GPL"); static int __init nat_init_module(void) { return tcf_register_action(&act_nat_ops, &nat_net_ops); } static void __exit nat_cleanup_module(void) { tcf_unregister_action(&act_nat_ops, &nat_net_ops); } module_init(nat_init_module); module_exit(nat_cleanup_module); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 | // SPDX-License-Identifier: GPL-2.0 /* * consolidates trace point definitions * * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com> */ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/string.h> #include <linux/if_arp.h> #include <linux/inetdevice.h> #include <linux/inet.h> #include <linux/interrupt.h> #include <linux/export.h> #include <linux/netpoll.h> #include <linux/sched.h> #include <linux/delay.h> #include <linux/rcupdate.h> #include <linux/types.h> #include <linux/workqueue.h> #include <linux/netlink.h> #include <linux/net_dropmon.h> #include <linux/slab.h> #include <asm/unaligned.h> #include <asm/bitops.h> #define CREATE_TRACE_POINTS #include <trace/events/skb.h> #include <trace/events/net.h> #include <trace/events/napi.h> #include <trace/events/sock.h> #include <trace/events/udp.h> #include <trace/events/tcp.h> #include <trace/events/fib.h> #include <trace/events/qdisc.h> #if IS_ENABLED(CONFIG_BRIDGE) #include <trace/events/bridge.h> EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add); EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_external_learn_add); EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete); EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update); #endif EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset); |
8 36 16 36 2 36 36 30 69 69 69 5 69 9 1 9 6 23 9 8 8 7 15 1 11 11 11 9 8 3 4 1 4 3 4 2 2 1 1 2 2 2 23 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 | /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> */ #include <linux/mm.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/sysctl.h> #include <linux/workqueue.h> #include <linux/static_key.h> #include <net/tcp.h> #include <net/inet_common.h> #include <net/xfrm.h> #include <net/busy_poll.h> static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) return true; if (after(end_seq, s_win) && before(seq, e_win)) return true; return seq == e_win && seq == end_seq; } static enum tcp_tw_status tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw, const struct sk_buff *skb, int mib_idx) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx, &tcptw->tw_last_oow_ack_time)) { /* Send ACK. Note, we do not put the bucket, * it will be released by caller. */ return TCP_TW_ACK; } /* We are rate-limiting, so just release the tw sock and drop skb. */ inet_twsk_put(tw); return TCP_TW_SUCCESS; } /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN * (and, probably, tail of data) and one or more our ACKs are lost. * * What is TIME-WAIT timeout? It is associated with maximal packet * lifetime in the internet, which results in wrong conclusion, that * it is set to catch "old duplicate segments" wandering out of their path. * It is not quite correct. This timeout is calculated so that it exceeds * maximal retransmission timeout enough to allow to lose one (or more) * segments sent by peer and our ACKs. This time may be calculated from RTO. * * When TIME-WAIT socket receives RST, it means that another end * finally closed and we are allowed to kill TIME-WAIT too. * * Second purpose of TIME-WAIT is catching old duplicate segments. * Well, certainly it is pure paranoia, but if we load TIME-WAIT * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs. * * If we invented some more clever way to catch duplicates * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs. * * The algorithm below is based on FORMAL INTERPRETATION of RFCs. * When you compare it to RFCs, please, read section SEGMENT ARRIVES * from the very beginning. * * NOTE. With recycling (and later with fin-wait-2) TW bucket * is _not_ stateless. It means, that strictly speaking we must * spinlock it. I do not want! Well, probability of misbehaviour * is ridiculously low and, seems, we could use some mb() tricks * to avoid misread sequence numbers, states etc. --ANK * * We don't need to initialize tmp_out.sack_ok as we don't use the results */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th) { struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); bool paws_reject = false; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { if (tmp_opt.rcv_tsecr) tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; tmp_opt.ts_recent = tcptw->tw_ts_recent; tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } if (tw->tw_substate == TCP_FIN_WAIT2) { /* Just repeat all the checks of tcp_rcv_state_process() */ /* Out of window, send ACK */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt, tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2); if (th->rst) goto kill; if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt)) return TCP_TW_RST; /* Dup ACK? */ if (!th->ack || !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { inet_twsk_put(tw); return TCP_TW_SUCCESS; } /* New data or FIN. If new data arrive after half-duplex close, * reset. */ if (!th->fin || TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) return TCP_TW_RST; /* FIN arrived, enter true time-wait state. */ tw->tw_substate = TCP_TIME_WAIT; tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent_stamp = ktime_get_seconds(); tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } /* * Now real TIME-WAIT state. * * RFC 1122: * "When a connection is [...] on TIME-WAIT state [...] * [a TCP] MAY accept a new SYN from the remote TCP to * reopen the connection directly, if it: * * (1) assigns its initial sequence number for the new * connection to be larger than the largest sequence * number it used on the previous connection incarnation, * and * * (2) returns to TIME-WAIT state if the SYN turns out * to be an old duplicate". */ if (!paws_reject && (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { /* In window segment, it may be only reset or bare ack. */ if (th->rst) { /* This is TIME_WAIT assassination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) { kill: inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; } } else { inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); } if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; tcptw->tw_ts_recent_stamp = ktime_get_seconds(); } inet_twsk_put(tw); return TCP_TW_SUCCESS; } /* Out of window segment. All the segments are ACKed immediately. The only exception is new SYN. We accept it, if it is not old duplicate and we are not in danger to be killed by delayed old duplicates. RFC check is that it has newer sequence number works at rates <40Mbit/sec. However, if paws works, it is reliable AND even more, we even may relax silly seq space cutoff. RED-PEN: we violate main RFC requirement, if this SYN will appear old duplicate (i.e. we receive RST in reply to SYN-ACK), we must return socket to time-wait state. It is not good, but not fatal yet. */ if (th->syn && !th->rst && !th->ack && !paws_reject && (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) || (tmp_opt.saw_tstamp && (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; TCP_SKB_CB(skb)->tcp_tw_isn = isn; return TCP_TW_SYN; } if (paws_reject) __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. * * If it is ACKless SYN it may be both old duplicate * and new good SYN with random sequence number <rcv_nxt. * Do not reschedule in the last case. */ if (paws_reject || th->ack) inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); } inet_twsk_put(tw); return TCP_TW_SUCCESS; } EXPORT_SYMBOL(tcp_timewait_state_process); /* * Move a socket to time-wait or dead fin-wait-2 state. */ void tcp_time_wait(struct sock *sk, int state, int timeo) { const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); struct inet_timewait_sock *tw; struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; tw = inet_twsk_alloc(sk, tcp_death_row, state); if (tw) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); struct inet_sock *inet = inet_sk(sk); tw->tw_transparent = inet->transparent; tw->tw_mark = sk->sk_mark; tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; tcptw->tw_rcv_wnd = tcp_receive_window(tp); tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; tcptw->tw_last_oow_ack_time = 0; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); tw->tw_v6_daddr = sk->sk_v6_daddr; tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK); tw->tw_ipv6only = sk->sk_ipv6only; } #endif #ifdef CONFIG_TCP_MD5SIG /* * The timewait bucket does not have the key DB from the * sock structure. We just make a quick copy of the * md5 key being used (if indeed we are using one) * so the timewait ack generating code has the key. */ do { struct tcp_md5sig_key *key; tcptw->tw_md5_key = NULL; key = tp->af_specific->md5_lookup(sk, sk); if (key) { tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); BUG_ON(tcptw->tw_md5_key && !tcp_alloc_md5sig_pool()); } } while (0); #endif /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) timeo = rto; if (state == TCP_TIME_WAIT) timeo = TCP_TIMEWAIT_LEN; /* tw_timer is pinned, so we need to make sure BH are disabled * in following section, otherwise timer handler could run before * we complete the initialization. */ local_bh_disable(); inet_twsk_schedule(tw, timeo); /* Linkage updates. * Note that access to tw after this point is illegal. */ inet_twsk_hashdance(tw, sk, &tcp_hashinfo); local_bh_enable(); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than * non-graceful socket closings. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW); } tcp_update_metrics(sk); tcp_done(sk); } EXPORT_SYMBOL(tcp_time_wait); void tcp_twsk_destructor(struct sock *sk) { #ifdef CONFIG_TCP_MD5SIG struct tcp_timewait_sock *twsk = tcp_twsk(sk); if (twsk->tw_md5_key) kfree_rcu(twsk->tw_md5_key, rcu); #endif } EXPORT_SYMBOL_GPL(tcp_twsk_destructor); /* Warning : This function is called without sk_listener being locked. * Be sure to read socket fields once, as their value could change under us. */ void tcp_openreq_init_rwin(struct request_sock *req, const struct sock *sk_listener, const struct dst_entry *dst) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk_listener); int full_space = tcp_full_space(sk_listener); u32 window_clamp; __u8 rcv_wscale; u32 rcv_wnd; int mss; mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); window_clamp = READ_ONCE(tp->window_clamp); /* Set this up on the first call only */ req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK && (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req); if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); else if (full_space < rcv_wnd * mss) full_space = rcv_wnd * mss; /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(sk_listener, full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, rcv_wnd); ireq->rcv_wscale = rcv_wscale; } EXPORT_SYMBOL(tcp_openreq_init_rwin); static void tcp_ecn_openreq_child(struct tcp_sock *tp, const struct request_sock *req) { tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; } void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); bool ca_got_dst = false; if (ca_key != TCP_CA_UNSPEC) { const struct tcp_congestion_ops *ca; rcu_read_lock(); ca = tcp_ca_find_key(ca_key); if (likely(ca && try_module_get(ca->owner))) { icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; ca_got_dst = true; } rcu_read_unlock(); } /* If no valid choice made yet, assign current system default ca. */ if (!ca_got_dst && (!icsk->icsk_ca_setsockopt || !try_module_get(icsk->icsk_ca_ops->owner))) tcp_assign_congestion_control(sk); tcp_set_ca_state(sk, TCP_CA_Open); } EXPORT_SYMBOL_GPL(tcp_ca_openreq_child); static void smc_check_reset_syn_req(struct tcp_sock *oldtp, struct request_sock *req, struct tcp_sock *newtp) { #if IS_ENABLED(CONFIG_SMC) struct inet_request_sock *ireq; if (static_branch_unlikely(&tcp_have_smc)) { ireq = inet_rsk(req); if (oldtp->syn_smc && !ireq->smc_ok) newtp->syn_smc = 0; } #endif } /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM * * Actually, we could lots of memory writes here. tp of listening * socket contains all necessary default parameters. */ struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, struct sk_buff *skb) { struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); const struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk; struct tcp_sock *oldtp, *newtp; u32 seq; if (!newsk) return NULL; newicsk = inet_csk(newsk); newtp = tcp_sk(newsk); oldtp = tcp_sk(sk); smc_check_reset_syn_req(oldtp, req, newtp); /* Now setup tcp_sock */ newtp->pred_flags = 0; seq = treq->rcv_isn + 1; newtp->rcv_wup = seq; WRITE_ONCE(newtp->copied_seq, seq); WRITE_ONCE(newtp->rcv_nxt, seq); newtp->segs_in = 1; newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; INIT_LIST_HEAD(&newtp->tsq_node); INIT_LIST_HEAD(&newtp->tsorted_sent_queue); tcp_init_wl(newtp, treq->rcv_isn); newtp->srtt_us = 0; newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); newicsk->icsk_rto = TCP_TIMEOUT_INIT; newicsk->icsk_ack.lrcvtime = tcp_jiffies32; newtp->packets_out = 0; newtp->retrans_out = 0; newtp->sacked_out = 0; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; newtp->tlp_high_seq = 0; newtp->lsndtime = tcp_jiffies32; newsk->sk_txhash = treq->txhash; newtp->last_oow_ack_time = 0; newtp->total_retrans = req->num_retrans; /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control * algorithms that we must have the following bandaid to talk * efficiently to them. -DaveM */ newtp->snd_cwnd = TCP_INIT_CWND; newtp->snd_cwnd_cnt = 0; /* There's a bubble in the pipe until at least the first ACK. */ newtp->app_limited = ~0U; tcp_init_xmit_timers(newsk); WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1); newtp->rx_opt.saw_tstamp = 0; newtp->rx_opt.dsack = 0; newtp->rx_opt.num_sacks = 0; newtp->urg_data = 0; if (sock_flag(newsk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; newtp->rx_opt.sack_ok = ireq->sack_ok; newtp->window_clamp = req->rsk_window_clamp; newtp->rcv_ssthresh = req->rsk_rcv_wnd; newtp->rcv_wnd = req->rsk_rcv_wnd; newtp->rx_opt.wscale_ok = ireq->wscale_ok; if (newtp->rx_opt.wscale_ok) { newtp->rx_opt.snd_wscale = ireq->snd_wscale; newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; } else { newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; newtp->window_clamp = min(newtp->window_clamp, 65535U); } newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale; newtp->max_window = newtp->snd_wnd; if (newtp->rx_opt.tstamp_ok) { newtp->rx_opt.ts_recent = req->ts_recent; newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { newtp->rx_opt.ts_recent_stamp = 0; newtp->tcp_header_len = sizeof(struct tcphdr); } newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ if (treq->af_specific->req_md5_lookup(sk, req_to_sk(req))) newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; #endif if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; tcp_ecn_openreq_child(newtp, req); newtp->fastopen_req = NULL; newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; newtp->rack.mstamp = 0; newtp->rack.advanced = 0; newtp->rack.reo_wnd_steps = 1; newtp->rack.last_delivered = 0; newtp->rack.reo_wnd_persist = 0; newtp->rack.dsack_seen = 0; __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); return newsk; } EXPORT_SYMBOL(tcp_create_openreq_child); /* * Process an incoming packet for SYN_RECV sockets represented as a * request_sock. Normally sk is the listener socket but for TFO it * points to the child socket. * * XXX (TFO) - The current impl contains a special check for ack * validation and inside tcp_v4_reqsk_send_ack(). Can we do better? * * We don't need to initialize tmp_opt.sack_ok as we don't use the results */ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *req_stolen) { struct tcp_options_received tmp_opt; struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); bool paws_reject = false; bool own_req; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; if (tmp_opt.rcv_tsecr) tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off; /* We do not store true stamp, but it is not required, * it can be estimated (approximately) * from another data. */ tmp_opt.ts_recent_stamp = ktime_get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout); paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } /* Check for pure retransmitted SYN. */ if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn && flg == TCP_FLAG_SYN && !paws_reject) { /* * RFC793 draws (Incorrectly! It was fixed in RFC1122) * this case on figure 6 and figure 8, but formal * protocol description says NOTHING. * To be more exact, it says that we should send ACK, * because this segment (at least, if it has no data) * is out of window. * * CONCLUSION: RFC793 (even with RFC1122) DOES NOT * describe SYN-RECV state. All the description * is wrong, we cannot believe to it and should * rely only on common sense and implementation * experience. * * Enforce "SYN-ACK" according to figure 8, figure 6 * of RFC793, fixed by RFC1122. * * Note that even if there is new data in the SYN packet * they will be thrown away too. * * Reset timer after retransmitting SYNACK, similar to * the idea of fast retransmit in recovery. */ if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time) && !inet_rtx_syn_ack(sk, req)) { unsigned long expires = jiffies; expires += min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); if (!fastopen) mod_timer_pending(&req->rsk_timer, expires); else req->rsk_timer.expires = expires; } return NULL; } /* Further reproduces section "SEGMENT ARRIVES" for state SYN-RECEIVED of RFC793. It is broken, however, it does not work only when SYNs are crossed. You would think that SYN crossing is impossible here, since we should have a SYN_SENT socket (from connect()) on our end, but this is not true if the crossed SYNs were sent to both ends by a malicious third party. We must defend against this, and to do that we first verify the ACK (as per RFC793, page 36) and reset if it is invalid. Is this a true full defense? To convince ourselves, let us consider a way in which the ACK test can still pass in this 'malicious crossed SYNs' case. Malicious sender sends identical SYNs (and thus identical sequence numbers) to both A and B: A: gets SYN, seq=7 B: gets SYN, seq=7 By our good fortune, both A and B select the same initial send sequence number of seven :-) A: sends SYN|ACK, seq=7, ack_seq=8 B: sends SYN|ACK, seq=7, ack_seq=8 So we are now A eating this SYN|ACK, ACK test passes. So does sequence test, SYN is truncated, and thus we consider it a bare ACK. If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this bare ACK. Otherwise, we create an established connection. Both ends (listening sockets) accept the new incoming connection and try to talk to each other. 8-) Note: This case is both harmless, and rare. Possibility is about the same as us discovering intelligent life on another plant tomorrow. But generally, we should (RFC lies!) to accept ACK from SYNACK both here and in tcp_rcv_state_process(). tcp_rcv_state_process() does not, hence, we do not too. Note that the case is absolutely generic: we cannot optimize anything here without violating protocol. All the checks must be made before attempt to create socket. */ /* RFC793 page 36: "If the connection is in any non-synchronized state ... * and the incoming segment acknowledges something not yet * sent (the segment carries an unacceptable ACK) ... * a reset is sent." * * Invalid ACK: reset will be sent by listening socket. * Note that the ACK validity check for a Fast Open socket is done * elsewhere and is checked directly against the child socket rather * than req because user data may have been sent out. */ if ((flg & TCP_FLAG_ACK) && !fastopen && (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1)) return sk; /* Also, it would be not so bad idea to check rcv_tsecr, which * is essentially ACK extension and too early or too late values * should cause reset in unsynchronized states. */ /* RFC793: "first check sequence number". */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST) && !tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time)) req->rsk_ops->send_ack(sk, skb, req); if (paws_reject) __NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); return NULL; } /* In sequence, PAWS is OK. */ if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) req->ts_recent = tmp_opt.rcv_tsval; if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { /* Truncate SYN, it is out of window starting at tcp_rsk(req)->rcv_isn + 1. */ flg &= ~TCP_FLAG_SYN; } /* RFC793: "second check the RST bit" and * "fourth, check the SYN bit" */ if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { __TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); goto embryonic_reset; } /* ACK sequence verified above, just make sure ACK is * set. If ACK not set, just silently drop the packet. * * XXX (TFO) - if we ever allow "data after SYN", the * following check needs to be removed. */ if (!(flg & TCP_FLAG_ACK)) return NULL; /* For Fast Open no more processing is needed (sk is the * child socket). */ if (fastopen) return sk; /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); return NULL; } /* OK, ACK is valid, create big socket and * feed this segment to it. It will repeat all * the tests. THIS SEGMENT MUST MOVE SOCKET TO * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. */ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, req, &own_req); if (!child) goto listen_overflow; sock_rps_save_rxhash(child, skb); tcp_synack_rtt_meas(child, req); *req_stolen = !own_req; return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) { inet_rsk(req)->acked = 1; return NULL; } embryonic_reset: if (!(flg & TCP_FLAG_RST)) { /* Received a bad SYN pkt - for TFO We try not to reset * the local connection unless it's really necessary to * avoid becoming vulnerable to outside attack aiming at * resetting legit local connections. */ req->rsk_ops->send_reset(sk, skb); } else if (fastopen) { /* received a valid RST pkt */ reqsk_fastopen_remove(sk, req, true); tcp_reset(sk); } if (!fastopen) { bool unlinked = inet_csk_reqsk_queue_drop(sk, req); if (unlinked) __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); *req_stolen = !unlinked; } return NULL; } EXPORT_SYMBOL(tcp_check_req); /* * Queue segment on the new socket if the new socket is active, * otherwise we just shortcircuit this and continue with * the new socket. * * For the vast majority of cases child->sk_state will be TCP_SYN_RECV * when entering. But other states are possible due to a race condition * where after __inet_lookup_established() fails but before the listener * locked is obtained, other packets cause the same connection to * be created. */ int tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb) { int ret = 0; int state = child->sk_state; /* record NAPI ID of child */ sk_mark_napi_id(child, skb); tcp_segs_in(tcp_sk(child), skb); if (!sock_owned_by_user(child)) { ret = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) parent->sk_data_ready(parent); } else { /* Alas, it is possible again, because we do lookup * in main socket hash table and lock on listening * socket does not protect us more. */ __sk_add_backlog(child, skb); } bh_unlock_sock(child); sock_put(child); return ret; } EXPORT_SYMBOL(tcp_child_process); |
679 681 842 841 323 54 51 54 1545 661 1545 1 1 628 96 627 604 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 | /* * net/l3mdev/l3mdev.c - L3 master device implementation * Copyright (c) 2015 Cumulus Networks * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. */ #include <linux/netdevice.h> #include <net/fib_rules.h> #include <net/l3mdev.h> /** * l3mdev_master_ifindex - get index of L3 master device * @dev: targeted interface */ int l3mdev_master_ifindex_rcu(const struct net_device *dev) { int ifindex = 0; if (!dev) return 0; if (netif_is_l3_master(dev)) { ifindex = dev->ifindex; } else if (netif_is_l3_slave(dev)) { struct net_device *master; struct net_device *_dev = (struct net_device *)dev; /* netdev_master_upper_dev_get_rcu calls * list_first_or_null_rcu to walk the upper dev list. * list_first_or_null_rcu does not handle a const arg. We aren't * making changes, just want the master device from that list so * typecast to remove the const */ master = netdev_master_upper_dev_get_rcu(_dev); if (master) ifindex = master->ifindex; } return ifindex; } EXPORT_SYMBOL_GPL(l3mdev_master_ifindex_rcu); /** * l3mdev_fib_table - get FIB table id associated with an L3 * master interface * @dev: targeted interface */ u32 l3mdev_fib_table_rcu(const struct net_device *dev) { u32 tb_id = 0; if (!dev) return 0; if (netif_is_l3_master(dev)) { if (dev->l3mdev_ops->l3mdev_fib_table) tb_id = dev->l3mdev_ops->l3mdev_fib_table(dev); } else if (netif_is_l3_slave(dev)) { /* Users of netdev_master_upper_dev_get_rcu need non-const, * but current inet_*type functions take a const */ struct net_device *_dev = (struct net_device *) dev; const struct net_device *master; master = netdev_master_upper_dev_get_rcu(_dev); if (master && master->l3mdev_ops->l3mdev_fib_table) tb_id = master->l3mdev_ops->l3mdev_fib_table(master); } return tb_id; } EXPORT_SYMBOL_GPL(l3mdev_fib_table_rcu); u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) { struct net_device *dev; u32 tb_id = 0; if (!ifindex) return 0; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) tb_id = l3mdev_fib_table_rcu(dev); rcu_read_unlock(); return tb_id; } EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); /** * l3mdev_link_scope_lookup - IPv6 route lookup based on flow for link * local and multicast addresses * @net: network namespace for device index lookup * @fl6: IPv6 flow struct for lookup */ struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6) { struct dst_entry *dst = NULL; struct net_device *dev; if (fl6->flowi6_oif) { rcu_read_lock(); dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); if (dev && netif_is_l3_slave(dev)) dev = netdev_master_upper_dev_get_rcu(dev); if (dev && netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_link_scope_lookup) dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6); rcu_read_unlock(); } return dst; } EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup); /** * l3mdev_fib_rule_match - Determine if flowi references an * L3 master device * @net: network namespace for device index lookup * @fl: flow struct */ int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg) { struct net_device *dev; int rc = 0; rcu_read_lock(); dev = dev_get_by_index_rcu(net, fl->flowi_oif); if (dev && netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_fib_table) { arg->table = dev->l3mdev_ops->l3mdev_fib_table(dev); rc = 1; goto out; } dev = dev_get_by_index_rcu(net, fl->flowi_iif); if (dev && netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_fib_table) { arg->table = dev->l3mdev_ops->l3mdev_fib_table(dev); rc = 1; goto out; } out: rcu_read_unlock(); return rc; } void l3mdev_update_flow(struct net *net, struct flowi *fl) { struct net_device *dev; int ifindex; rcu_read_lock(); if (fl->flowi_oif) { dev = dev_get_by_index_rcu(net, fl->flowi_oif); if (dev) { ifindex = l3mdev_master_ifindex_rcu(dev); if (ifindex) { fl->flowi_oif = ifindex; fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF; goto out; } } } if (fl->flowi_iif) { dev = dev_get_by_index_rcu(net, fl->flowi_iif); if (dev) { ifindex = l3mdev_master_ifindex_rcu(dev); if (ifindex) { fl->flowi_iif = ifindex; fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF; } } } out: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(l3mdev_update_flow); |
226 226 207 206 62 62 121 120 226 29 29 29 29 28 13 6 7 29 219 219 198 208 218 219 218 219 219 219 143 64 64 219 218 226 184 226 225 225 219 219 219 5 225 226 2 2 184 149 149 183 26 25 3 1 26 26 26 26 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/bfind.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Search routines for btrees */ #include <linux/slab.h> #include "hfsplus_fs.h" int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) { void *ptr; fd->tree = tree; fd->bnode = NULL; ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); if (!ptr) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); switch (tree->cnid) { case HFSPLUS_CAT_CNID: mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX); break; case HFSPLUS_EXT_CNID: mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX); break; case HFSPLUS_ATTR_CNID: mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX); break; default: BUG(); } return 0; } void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; } int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, struct hfs_find_data *fd, int *begin, int *end, int *cur_rec) { __be32 cur_cnid; __be32 search_cnid; if (bnode->tree->cnid == HFSPLUS_EXT_CNID) { cur_cnid = fd->key->ext.cnid; search_cnid = fd->search_key->ext.cnid; } else if (bnode->tree->cnid == HFSPLUS_CAT_CNID) { cur_cnid = fd->key->cat.parent; search_cnid = fd->search_key->cat.parent; } else if (bnode->tree->cnid == HFSPLUS_ATTR_CNID) { cur_cnid = fd->key->attr.cnid; search_cnid = fd->search_key->attr.cnid; } else { cur_cnid = 0; /* used-uninitialized warning */ search_cnid = 0; BUG(); } if (cur_cnid == search_cnid) { (*end) = (*cur_rec); if ((*begin) == (*end)) return 1; } else { if (be32_to_cpu(cur_cnid) < be32_to_cpu(search_cnid)) (*begin) = (*cur_rec) + 1; else (*end) = (*cur_rec) - 1; } return 0; } int hfs_find_rec_by_key(struct hfs_bnode *bnode, struct hfs_find_data *fd, int *begin, int *end, int *cur_rec) { int cmpval; cmpval = bnode->tree->keycmp(fd->key, fd->search_key); if (!cmpval) { (*end) = (*cur_rec); return 1; } if (cmpval < 0) (*begin) = (*cur_rec) + 1; else *(end) = (*cur_rec) - 1; return 0; } /* Find the record in bnode that best matches key (not greater than...)*/ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd, search_strategy_t rec_found) { u16 off, len, keylen; int rec; int b, e; int res; BUG_ON(!rec_found); b = 0; e = bnode->num_recs - 1; res = -ENOENT; do { rec = (e + b) / 2; len = hfs_brec_lenoff(bnode, rec, &off); keylen = hfs_brec_keylen(bnode, rec); if (keylen == 0) { res = -EINVAL; goto fail; } hfs_bnode_read(bnode, fd->key, off, keylen); if (rec_found(bnode, fd, &b, &e, &rec)) { res = 0; goto done; } } while (b <= e); if (rec != e && e >= 0) { len = hfs_brec_lenoff(bnode, e, &off); keylen = hfs_brec_keylen(bnode, e); if (keylen == 0) { res = -EINVAL; goto fail; } hfs_bnode_read(bnode, fd->key, off, keylen); } done: fd->record = e; fd->keyoffset = off; fd->keylength = keylen; fd->entryoffset = off + keylen; fd->entrylength = len - keylen; fail: return res; } /* Traverse a B*Tree from the root to a leaf finding best fit to key */ /* Return allocated copy of node found, set recnum to best record */ int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare) { struct hfs_btree *tree; struct hfs_bnode *bnode; u32 nidx, parent; __be32 data; int height, res; tree = fd->tree; if (fd->bnode) hfs_bnode_put(fd->bnode); fd->bnode = NULL; nidx = tree->root; if (!nidx) return -ENOENT; height = tree->depth; res = 0; parent = 0; for (;;) { bnode = hfs_bnode_find(tree, nidx); if (IS_ERR(bnode)) { res = PTR_ERR(bnode); bnode = NULL; break; } if (bnode->height != height) goto invalid; if (bnode->type != (--height ? HFS_NODE_INDEX : HFS_NODE_LEAF)) goto invalid; bnode->parent = parent; res = __hfs_brec_find(bnode, fd, do_key_compare); if (!height) break; if (fd->record < 0) goto release; parent = nidx; hfs_bnode_read(bnode, &data, fd->entryoffset, 4); nidx = be32_to_cpu(data); hfs_bnode_put(bnode); } fd->bnode = bnode; return res; invalid: pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", height, bnode->height, bnode->type, nidx, parent); res = -EIO; release: hfs_bnode_put(bnode); return res; } int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len) { int res; res = hfs_brec_find(fd, hfs_find_rec_by_key); if (res) return res; if (fd->entrylength > rec_len) return -EINVAL; hfs_bnode_read(fd->bnode, rec, fd->entryoffset, fd->entrylength); return 0; } int hfs_brec_goto(struct hfs_find_data *fd, int cnt) { struct hfs_btree *tree; struct hfs_bnode *bnode; int idx, res = 0; u16 off, len, keylen; bnode = fd->bnode; tree = bnode->tree; if (cnt < 0) { cnt = -cnt; while (cnt > fd->record) { cnt -= fd->record + 1; fd->record = bnode->num_recs - 1; idx = bnode->prev; if (!idx) { res = -ENOENT; goto out; } hfs_bnode_put(bnode); bnode = hfs_bnode_find(tree, idx); if (IS_ERR(bnode)) { res = PTR_ERR(bnode); bnode = NULL; goto out; } } fd->record -= cnt; } else { while (cnt >= bnode->num_recs - fd->record) { cnt -= bnode->num_recs - fd->record; fd->record = 0; idx = bnode->next; if (!idx) { res = -ENOENT; goto out; } hfs_bnode_put(bnode); bnode = hfs_bnode_find(tree, idx); if (IS_ERR(bnode)) { res = PTR_ERR(bnode); bnode = NULL; goto out; } } fd->record += cnt; } len = hfs_brec_lenoff(bnode, fd->record, &off); keylen = hfs_brec_keylen(bnode, fd->record); if (keylen == 0) { res = -EINVAL; goto out; } fd->keyoffset = off; fd->keylength = keylen; fd->entryoffset = off + keylen; fd->entrylength = len - keylen; hfs_bnode_read(bnode, fd->key, off, keylen); out: fd->bnode = bnode; return res; } |
1 1 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 | /* * Copyright (c) 2016 Intel Corporation * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. */ #include <drm/drmP.h> #include <drm/drm_crtc.h> #include <drm/drm_color_mgmt.h> #include "drm_crtc_internal.h" /** * DOC: overview * * Color management or color space adjustments is supported through a set of 5 * properties on the &drm_crtc object. They are set up by calling * drm_crtc_enable_color_mgmt(). * * "DEGAMMA_LUT”: * Blob property to set the degamma lookup table (LUT) mapping pixel data * from the framebuffer before it is given to the transformation matrix. * The data is interpreted as an array of &struct drm_color_lut elements. * Hardware might choose not to use the full precision of the LUT elements * nor use all the elements of the LUT (for example the hardware might * choose to interpolate between LUT[0] and LUT[4]). * * Setting this to NULL (blob property value set to 0) means a * linear/pass-thru gamma table should be used. This is generally the * driver boot-up state too. Drivers can access this blob through * &drm_crtc_state.degamma_lut. * * “DEGAMMA_LUT_SIZE”: * Unsinged range property to give the size of the lookup table to be set * on the DEGAMMA_LUT property (the size depends on the underlying * hardware). If drivers support multiple LUT sizes then they should * publish the largest size, and sub-sample smaller sized LUTs (e.g. for * split-gamma modes) appropriately. * * “CTM”: * Blob property to set the current transformation matrix (CTM) apply to * pixel data after the lookup through the degamma LUT and before the * lookup through the gamma LUT. The data is interpreted as a struct * &drm_color_ctm. * * Setting this to NULL (blob property value set to 0) means a * unit/pass-thru matrix should be used. This is generally the driver * boot-up state too. Drivers can access the blob for the color conversion * matrix through &drm_crtc_state.ctm. * * “GAMMA_LUT”: * Blob property to set the gamma lookup table (LUT) mapping pixel data * after the transformation matrix to data sent to the connector. The * data is interpreted as an array of &struct drm_color_lut elements. * Hardware might choose not to use the full precision of the LUT elements * nor use all the elements of the LUT (for example the hardware might * choose to interpolate between LUT[0] and LUT[4]). * * Setting this to NULL (blob property value set to 0) means a * linear/pass-thru gamma table should be used. This is generally the * driver boot-up state too. Drivers can access this blob through * &drm_crtc_state.gamma_lut. * * “GAMMA_LUT_SIZE”: * Unsigned range property to give the size of the lookup table to be set * on the GAMMA_LUT property (the size depends on the underlying hardware). * If drivers support multiple LUT sizes then they should publish the * largest size, and sub-sample smaller sized LUTs (e.g. for split-gamma * modes) appropriately. * * There is also support for a legacy gamma table, which is set up by calling * drm_mode_crtc_set_gamma_size(). Drivers which support both should use * drm_atomic_helper_legacy_gamma_set() to alias the legacy gamma ramp with the * "GAMMA_LUT" property above. * * Support for different non RGB color encodings is controlled through * &drm_plane specific COLOR_ENCODING and COLOR_RANGE properties. They * are set up by calling drm_plane_create_color_properties(). * * "COLOR_ENCODING" * Optional plane enum property to support different non RGB * color encodings. The driver can provide a subset of standard * enum values supported by the DRM plane. * * "COLOR_RANGE" * Optional plane enum property to support different non RGB * color parameter ranges. The driver can provide a subset of * standard enum values supported by the DRM plane. */ /** * drm_color_lut_extract - clamp and round LUT entries * @user_input: input value * @bit_precision: number of bits the hw LUT supports * * Extract a degamma/gamma LUT value provided by user (in the form of * &drm_color_lut entries) and round it to the precision supported by the * hardware. */ uint32_t drm_color_lut_extract(uint32_t user_input, uint32_t bit_precision) { uint32_t val = user_input; uint32_t max = 0xffff >> (16 - bit_precision); /* Round only if we're not using full precision. */ if (bit_precision < 16) { val += 1UL << (16 - bit_precision - 1); val >>= 16 - bit_precision; } return clamp_val(val, 0, max); } EXPORT_SYMBOL(drm_color_lut_extract); /** * drm_crtc_enable_color_mgmt - enable color management properties * @crtc: DRM CRTC * @degamma_lut_size: the size of the degamma lut (before CSC) * @has_ctm: whether to attach ctm_property for CSC matrix * @gamma_lut_size: the size of the gamma lut (after CSC) * * This function lets the driver enable the color correction * properties on a CRTC. This includes 3 degamma, csc and gamma * properties that userspace can set and 2 size properties to inform * the userspace of the lut sizes. Each of the properties are * optional. The gamma and degamma properties are only attached if * their size is not 0 and ctm_property is only attached if has_ctm is * true. * * Drivers should use drm_atomic_helper_legacy_gamma_set() to implement the * legacy &drm_crtc_funcs.gamma_set callback. */ void drm_crtc_enable_color_mgmt(struct drm_crtc *crtc, uint degamma_lut_size, bool has_ctm, uint gamma_lut_size) { struct drm_device *dev = crtc->dev; struct drm_mode_config *config = &dev->mode_config; if (degamma_lut_size) { drm_object_attach_property(&crtc->base, config->degamma_lut_property, 0); drm_object_attach_property(&crtc->base, config->degamma_lut_size_property, degamma_lut_size); } if (has_ctm) drm_object_attach_property(&crtc->base, config->ctm_property, 0); if (gamma_lut_size) { drm_object_attach_property(&crtc->base, config->gamma_lut_property, 0); drm_object_attach_property(&crtc->base, config->gamma_lut_size_property, gamma_lut_size); } } EXPORT_SYMBOL(drm_crtc_enable_color_mgmt); /** * drm_mode_crtc_set_gamma_size - set the gamma table size * @crtc: CRTC to set the gamma table size for * @gamma_size: size of the gamma table * * Drivers which support gamma tables should set this to the supported gamma * table size when initializing the CRTC. Currently the drm core only supports a * fixed gamma table size. * * Returns: * Zero on success, negative errno on failure. */ int drm_mode_crtc_set_gamma_size(struct drm_crtc *crtc, int gamma_size) { uint16_t *r_base, *g_base, *b_base; int i; crtc->gamma_size = gamma_size; crtc->gamma_store = kcalloc(gamma_size, sizeof(uint16_t) * 3, GFP_KERNEL); if (!crtc->gamma_store) { crtc->gamma_size = 0; return -ENOMEM; } r_base = crtc->gamma_store; g_base = r_base + gamma_size; b_base = g_base + gamma_size; for (i = 0; i < gamma_size; i++) { r_base[i] = i << 8; g_base[i] = i << 8; b_base[i] = i << 8; } return 0; } EXPORT_SYMBOL(drm_mode_crtc_set_gamma_size); /** * drm_mode_gamma_set_ioctl - set the gamma table * @dev: DRM device * @data: ioctl data * @file_priv: DRM file info * * Set the gamma table of a CRTC to the one passed in by the user. Userspace can * inquire the required gamma table size through drm_mode_gamma_get_ioctl. * * Called by the user via ioctl. * * Returns: * Zero on success, negative errno on failure. */ int drm_mode_gamma_set_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_mode_crtc_lut *crtc_lut = data; struct drm_crtc *crtc; void *r_base, *g_base, *b_base; int size; struct drm_modeset_acquire_ctx ctx; int ret = 0; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EINVAL; crtc = drm_crtc_find(dev, file_priv, crtc_lut->crtc_id); if (!crtc) return -ENOENT; if (crtc->funcs->gamma_set == NULL) return -ENOSYS; /* memcpy into gamma store */ if (crtc_lut->gamma_size != crtc->gamma_size) return -EINVAL; drm_modeset_acquire_init(&ctx, 0); retry: ret = drm_modeset_lock_all_ctx(dev, &ctx); if (ret) goto out; size = crtc_lut->gamma_size * (sizeof(uint16_t)); r_base = crtc->gamma_store; if (copy_from_user(r_base, (void __user *)(unsigned long)crtc_lut->red, size)) { ret = -EFAULT; goto out; } g_base = r_base + size; if (copy_from_user(g_base, (void __user *)(unsigned long)crtc_lut->green, size)) { ret = -EFAULT; goto out; } b_base = g_base + size; if (copy_from_user(b_base, (void __user *)(unsigned long)crtc_lut->blue, size)) { ret = -EFAULT; goto out; } ret = crtc->funcs->gamma_set(crtc, r_base, g_base, b_base, crtc->gamma_size, &ctx); out: if (ret == -EDEADLK) { drm_modeset_backoff(&ctx); goto retry; } drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); return ret; } /** * drm_mode_gamma_get_ioctl - get the gamma table * @dev: DRM device * @data: ioctl data * @file_priv: DRM file info * * Copy the current gamma table into the storage provided. This also provides * the gamma table size the driver expects, which can be used to size the * allocated storage. * * Called by the user via ioctl. * * Returns: * Zero on success, negative errno on failure. */ int drm_mode_gamma_get_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_mode_crtc_lut *crtc_lut = data; struct drm_crtc *crtc; void *r_base, *g_base, *b_base; int size; int ret = 0; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EINVAL; crtc = drm_crtc_find(dev, file_priv, crtc_lut->crtc_id); if (!crtc) return -ENOENT; /* memcpy into gamma store */ if (crtc_lut->gamma_size != crtc->gamma_size) return -EINVAL; drm_modeset_lock(&crtc->mutex, NULL); size = crtc_lut->gamma_size * (sizeof(uint16_t)); r_base = crtc->gamma_store; if (copy_to_user((void __user *)(unsigned long)crtc_lut->red, r_base, size)) { ret = -EFAULT; goto out; } g_base = r_base + size; if (copy_to_user((void __user *)(unsigned long)crtc_lut->green, g_base, size)) { ret = -EFAULT; goto out; } b_base = g_base + size; if (copy_to_user((void __user *)(unsigned long)crtc_lut->blue, b_base, size)) { ret = -EFAULT; goto out; } out: drm_modeset_unlock(&crtc->mutex); return ret; } static const char * const color_encoding_name[] = { [DRM_COLOR_YCBCR_BT601] = "ITU-R BT.601 YCbCr", [DRM_COLOR_YCBCR_BT709] = "ITU-R BT.709 YCbCr", [DRM_COLOR_YCBCR_BT2020] = "ITU-R BT.2020 YCbCr", }; static const char * const color_range_name[] = { [DRM_COLOR_YCBCR_FULL_RANGE] = "YCbCr full range", [DRM_COLOR_YCBCR_LIMITED_RANGE] = "YCbCr limited range", }; /** * drm_get_color_encoding_name - return a string for color encoding * @encoding: color encoding to compute name of * * In contrast to the other drm_get_*_name functions this one here returns a * const pointer and hence is threadsafe. */ const char *drm_get_color_encoding_name(enum drm_color_encoding encoding) { if (WARN_ON(encoding >= ARRAY_SIZE(color_encoding_name))) return "unknown"; return color_encoding_name[encoding]; } /** * drm_get_color_range_name - return a string for color range * @range: color range to compute name of * * In contrast to the other drm_get_*_name functions this one here returns a * const pointer and hence is threadsafe. */ const char *drm_get_color_range_name(enum drm_color_range range) { if (WARN_ON(range >= ARRAY_SIZE(color_range_name))) return "unknown"; return color_range_name[range]; } /** * drm_plane_create_color_properties - color encoding related plane properties * @plane: plane object * @supported_encodings: bitfield indicating supported color encodings * @supported_ranges: bitfileld indicating supported color ranges * @default_encoding: default color encoding * @default_range: default color range * * Create and attach plane specific COLOR_ENCODING and COLOR_RANGE * properties to @plane. The supported encodings and ranges should * be provided in supported_encodings and supported_ranges bitmasks. * Each bit set in the bitmask indicates that its number as enum * value is supported. */ int drm_plane_create_color_properties(struct drm_plane *plane, u32 supported_encodings, u32 supported_ranges, enum drm_color_encoding default_encoding, enum drm_color_range default_range) { struct drm_device *dev = plane->dev; struct drm_property *prop; struct drm_prop_enum_list enum_list[max_t(int, DRM_COLOR_ENCODING_MAX, DRM_COLOR_RANGE_MAX)]; int i, len; if (WARN_ON(supported_encodings == 0 || (supported_encodings & -BIT(DRM_COLOR_ENCODING_MAX)) != 0 || (supported_encodings & BIT(default_encoding)) == 0)) return -EINVAL; if (WARN_ON(supported_ranges == 0 || (supported_ranges & -BIT(DRM_COLOR_RANGE_MAX)) != 0 || (supported_ranges & BIT(default_range)) == 0)) return -EINVAL; len = 0; for (i = 0; i < DRM_COLOR_ENCODING_MAX; i++) { if ((supported_encodings & BIT(i)) == 0) continue; enum_list[len].type = i; enum_list[len].name = color_encoding_name[i]; len++; } prop = drm_property_create_enum(dev, 0, "COLOR_ENCODING", enum_list, len); if (!prop) return -ENOMEM; plane->color_encoding_property = prop; drm_object_attach_property(&plane->base, prop, default_encoding); if (plane->state) plane->state->color_encoding = default_encoding; len = 0; for (i = 0; i < DRM_COLOR_RANGE_MAX; i++) { if ((supported_ranges & BIT(i)) == 0) continue; enum_list[len].type = i; enum_list[len].name = color_range_name[i]; len++; } prop = drm_property_create_enum(dev, 0, "COLOR_RANGE", enum_list, len); if (!prop) return -ENOMEM; plane->color_range_property = prop; drm_object_attach_property(&plane->base, prop, default_range); if (plane->state) plane->state->color_range = default_range; return 0; } EXPORT_SYMBOL(drm_plane_create_color_properties); |
4 4 4 4 4 4 4 4 4 35 4 35 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 | /* * drivers/base/power/trace.c * * Copyright (C) 2006 Linus Torvalds * * Trace facility for suspend/resume problems, when none of the * devices may be working. */ #include <linux/pm-trace.h> #include <linux/export.h> #include <linux/rtc.h> #include <linux/suspend.h> #include <linux/init.h> #include <linux/mc146818rtc.h> #include "power.h" /* * Horrid, horrid, horrid. * * It turns out that the _only_ piece of hardware that actually * keeps its value across a hard boot (and, more importantly, the * POST init sequence) is literally the realtime clock. * * Never mind that an RTC chip has 114 bytes (and often a whole * other bank of an additional 128 bytes) of nice SRAM that is * _designed_ to keep data - the POST will clear it. So we literally * can just use the few bytes of actual time data, which means that * we're really limited. * * It means, for example, that we can't use the seconds at all * (since the time between the hang and the boot might be more * than a minute), and we'd better not depend on the low bits of * the minutes either. * * There are the wday fields etc, but I wouldn't guarantee those * are dependable either. And if the date isn't valid, either the * hw or POST will do strange things. * * So we're left with: * - year: 0-99 * - month: 0-11 * - day-of-month: 1-28 * - hour: 0-23 * - min: (0-30)*2 * * Giving us a total range of 0-16128000 (0xf61800), ie less * than 24 bits of actual data we can save across reboots. * * And if your box can't boot in less than three minutes, * you're screwed. * * Now, almost 24 bits of data is pitifully small, so we need * to be pretty dense if we want to use it for anything nice. * What we do is that instead of saving off nice readable info, * we save off _hashes_ of information that we can hopefully * regenerate after the reboot. * * In particular, this means that we might be unlucky, and hit * a case where we have a hash collision, and we end up not * being able to tell for certain exactly which case happened. * But that's hopefully unlikely. * * What we do is to take the bits we can fit, and split them * into three parts (16*997*1009 = 16095568), and use the values * for: * - 0-15: user-settable * - 0-996: file + line number * - 0-1008: device */ #define USERHASH (16) #define FILEHASH (997) #define DEVHASH (1009) #define DEVSEED (7919) bool pm_trace_rtc_abused __read_mostly; EXPORT_SYMBOL_GPL(pm_trace_rtc_abused); static unsigned int dev_hash_value; static int set_magic_time(unsigned int user, unsigned int file, unsigned int device) { unsigned int n = user + USERHASH*(file + FILEHASH*device); // June 7th, 2006 static struct rtc_time time = { .tm_sec = 0, .tm_min = 0, .tm_hour = 0, .tm_mday = 7, .tm_mon = 5, // June - counting from zero .tm_year = 106, .tm_wday = 3, .tm_yday = 160, .tm_isdst = 1 }; time.tm_year = (n % 100); n /= 100; time.tm_mon = (n % 12); n /= 12; time.tm_mday = (n % 28) + 1; n /= 28; time.tm_hour = (n % 24); n /= 24; time.tm_min = (n % 20) * 3; n /= 20; mc146818_set_time(&time); pm_trace_rtc_abused = true; return n ? -1 : 0; } static unsigned int read_magic_time(void) { struct rtc_time time; unsigned int val; mc146818_get_time(&time); pr_info("RTC time: %2d:%02d:%02d, date: %02d/%02d/%02d\n", time.tm_hour, time.tm_min, time.tm_sec, time.tm_mon + 1, time.tm_mday, time.tm_year % 100); val = time.tm_year; /* 100 years */ if (val > 100) val -= 100; val += time.tm_mon * 100; /* 12 months */ val += (time.tm_mday-1) * 100 * 12; /* 28 month-days */ val += time.tm_hour * 100 * 12 * 28; /* 24 hours */ val += (time.tm_min / 3) * 100 * 12 * 28 * 24; /* 20 3-minute intervals */ return val; } /* * This is just the sdbm hash function with a user-supplied * seed and final size parameter. */ static unsigned int hash_string(unsigned int seed, const char *data, unsigned int mod) { unsigned char c; while ((c = *data++) != 0) { seed = (seed << 16) + (seed << 6) - seed + c; } return seed % mod; } void set_trace_device(struct device *dev) { dev_hash_value = hash_string(DEVSEED, dev_name(dev), DEVHASH); } EXPORT_SYMBOL(set_trace_device); /* * We could just take the "tracedata" index into the .tracedata * section instead. Generating a hash of the data gives us a * chance to work across kernel versions, and perhaps more * importantly it also gives us valid/invalid check (ie we will * likely not give totally bogus reports - if the hash matches, * it's not any guarantee, but it's a high _likelihood_ that * the match is valid). */ void generate_pm_trace(const void *tracedata, unsigned int user) { unsigned short lineno = *(unsigned short *)tracedata; const char *file = *(const char **)(tracedata + 2); unsigned int user_hash_value, file_hash_value; if (!x86_platform.legacy.rtc) return; user_hash_value = user % USERHASH; file_hash_value = hash_string(lineno, file, FILEHASH); set_magic_time(user_hash_value, file_hash_value, dev_hash_value); } EXPORT_SYMBOL(generate_pm_trace); extern char __tracedata_start[], __tracedata_end[]; static int show_file_hash(unsigned int value) { int match; char *tracedata; match = 0; for (tracedata = __tracedata_start ; tracedata < __tracedata_end ; tracedata += 2 + sizeof(unsigned long)) { unsigned short lineno = *(unsigned short *)tracedata; const char *file = *(const char **)(tracedata + 2); unsigned int hash = hash_string(lineno, file, FILEHASH); if (hash != value) continue; pr_info(" hash matches %s:%u\n", file, lineno); match++; } return match; } static int show_dev_hash(unsigned int value) { int match = 0; struct list_head *entry; device_pm_lock(); entry = dpm_list.prev; while (entry != &dpm_list) { struct device * dev = to_device(entry); unsigned int hash = hash_string(DEVSEED, dev_name(dev), DEVHASH); if (hash == value) { dev_info(dev, "hash matches\n"); match++; } entry = entry->prev; } device_pm_unlock(); return match; } static unsigned int hash_value_early_read; int show_trace_dev_match(char *buf, size_t size) { unsigned int value = hash_value_early_read / (USERHASH * FILEHASH); int ret = 0; struct list_head *entry; /* * It's possible that multiple devices will match the hash and we can't * tell which is the culprit, so it's best to output them all. */ device_pm_lock(); entry = dpm_list.prev; while (size && entry != &dpm_list) { struct device *dev = to_device(entry); unsigned int hash = hash_string(DEVSEED, dev_name(dev), DEVHASH); if (hash == value) { int len = snprintf(buf, size, "%s\n", dev_driver_string(dev)); if (len > size) len = size; buf += len; ret += len; size -= len; } entry = entry->prev; } device_pm_unlock(); return ret; } static int pm_trace_notify(struct notifier_block *nb, unsigned long mode, void *_unused) { switch (mode) { case PM_POST_HIBERNATION: case PM_POST_SUSPEND: if (pm_trace_rtc_abused) { pm_trace_rtc_abused = false; pr_warn("Possible incorrect RTC due to pm_trace, please use 'ntpdate' or 'rdate' to reset it.\n"); } break; default: break; } return 0; } static struct notifier_block pm_trace_nb = { .notifier_call = pm_trace_notify, }; static int early_resume_init(void) { if (!x86_platform.legacy.rtc) return 0; hash_value_early_read = read_magic_time(); register_pm_notifier(&pm_trace_nb); return 0; } static int late_resume_init(void) { unsigned int val = hash_value_early_read; unsigned int user, file, dev; if (!x86_platform.legacy.rtc) return 0; user = val % USERHASH; val = val / USERHASH; file = val % FILEHASH; val = val / FILEHASH; dev = val /* % DEVHASH */; pr_info(" Magic number: %d:%d:%d\n", user, file, dev); show_file_hash(file); show_dev_hash(dev); return 0; } core_initcall(early_resume_init); late_initcall(late_resume_init); |
22 30 126 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 | /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #ifndef __KVM_IODEV_H__ #define __KVM_IODEV_H__ #include <linux/kvm_types.h> #include <linux/errno.h> struct kvm_io_device; struct kvm_vcpu; /** * kvm_io_device_ops are called under kvm slots_lock. * read and write handlers return 0 if the transaction has been handled, * or non-zero to have it passed to the next device. **/ struct kvm_io_device_ops { int (*read)(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, int len, void *val); int (*write)(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, int len, const void *val); void (*destructor)(struct kvm_io_device *this); }; struct kvm_io_device { const struct kvm_io_device_ops *ops; }; static inline void kvm_iodevice_init(struct kvm_io_device *dev, const struct kvm_io_device_ops *ops) { dev->ops = ops; } static inline int kvm_iodevice_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int l, void *v) { return dev->ops->read ? dev->ops->read(vcpu, dev, addr, l, v) : -EOPNOTSUPP; } static inline int kvm_iodevice_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int l, const void *v) { return dev->ops->write ? dev->ops->write(vcpu, dev, addr, l, v) : -EOPNOTSUPP; } static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) { if (dev->ops->destructor) dev->ops->destructor(dev); } #endif /* __KVM_IODEV_H__ */ |
10 10 10 12 2 2 2 2 2 2 2 12 11 2 2 2 12 10 10 10 2 12 12 12 17 17 10 10 8 2 2 10 10 10 10 10 10 10 31 1 9 9 9 9 9 9 9 9 17 17 9 4 3 3 4 4 3 13 31 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 | /* * Copyright (c) 2007 The University of Aberdeen, Scotland, UK * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz> * * An implementation of the DCCP protocol * * This code has been developed by the University of Waikato WAND * research group. For further information please see http://www.wand.net.nz/ * * This code also uses code from Lulea University, rereleased as GPL by its * authors: * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon * * Changes to meet Linux coding standards, to make it meet latest ccid3 draft * and to make it work as a loadable module in the DCCP stack written by * Arnaldo Carvalho de Melo <acme@conectiva.com.br>. * * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include "../dccp.h" #include "ccid3.h" #include <asm/unaligned.h> #ifdef CONFIG_IP_DCCP_CCID3_DEBUG static bool ccid3_debug; #define ccid3_pr_debug(format, a...) DCCP_PR_DEBUG(ccid3_debug, format, ##a) #else #define ccid3_pr_debug(format, a...) #endif /* * Transmitter Half-Connection Routines */ #ifdef CONFIG_IP_DCCP_CCID3_DEBUG static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) { static const char *const ccid3_state_names[] = { [TFRC_SSTATE_NO_SENT] = "NO_SENT", [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", [TFRC_SSTATE_FBACK] = "FBACK", }; return ccid3_state_names[state]; } #endif static void ccid3_hc_tx_set_state(struct sock *sk, enum ccid3_hc_tx_states state) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); enum ccid3_hc_tx_states oldstate = hc->tx_state; ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", dccp_role(sk), sk, ccid3_tx_state_name(oldstate), ccid3_tx_state_name(state)); WARN_ON(state == oldstate); hc->tx_state = state; } /* * Compute the initial sending rate X_init in the manner of RFC 3390: * * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT * * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis * (rev-02) clarifies the use of RFC 3390 with regard to the above formula. * For consistency with other parts of the code, X_init is scaled by 2^6. */ static inline u64 rfc3390_initial_rate(struct sock *sk) { const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); const __u32 w_init = clamp_t(__u32, 4380U, 2 * hc->tx_s, 4 * hc->tx_s); return scaled_div(w_init << 6, hc->tx_rtt); } /** * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst * This respects the granularity of X_inst (64 * bytes/second). */ static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc) { hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x); DCCP_BUG_ON(hc->tx_t_ipi == 0); ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi, hc->tx_s, (unsigned int)(hc->tx_x >> 6)); } static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now) { u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count); return delta / hc->tx_rtt; } /** * ccid3_hc_tx_update_x - Update allowed sending rate X * @stamp: most recent time if available - can be left NULL. * * This function tracks draft rfc3448bis, check there for latest details. * * Note: X and X_recv are both stored in units of 64 * bytes/second, to support * fine-grained resolution of sending rates. This requires scaling by 2^6 * throughout the code. Only X_calc is unscaled (in bytes/second). * */ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); __u64 min_rate = 2 * hc->tx_x_recv; const __u64 old_x = hc->tx_x; ktime_t now = stamp ? *stamp : ktime_get_real(); /* * Handle IDLE periods: do not reduce below RFC3390 initial sending rate * when idling [RFC 4342, 5.1]. Definition of idling is from rfc3448bis: * a sender is idle if it has not sent anything over a 2-RTT-period. * For consistency with X and X_recv, min_rate is also scaled by 2^6. */ if (ccid3_hc_tx_idle_rtt(hc, now) >= 2) { min_rate = rfc3390_initial_rate(sk); min_rate = max(min_rate, 2 * hc->tx_x_recv); } if (hc->tx_p > 0) { hc->tx_x = min(((__u64)hc->tx_x_calc) << 6, min_rate); hc->tx_x = max(hc->tx_x, (((__u64)hc->tx_s) << 6) / TFRC_T_MBI); } else if (ktime_us_delta(now, hc->tx_t_ld) - (s64)hc->tx_rtt >= 0) { hc->tx_x = min(2 * hc->tx_x, min_rate); hc->tx_x = max(hc->tx_x, scaled_div(((__u64)hc->tx_s) << 6, hc->tx_rtt)); hc->tx_t_ld = now; } if (hc->tx_x != old_x) { ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " "X_recv=%u\n", (unsigned int)(old_x >> 6), (unsigned int)(hc->tx_x >> 6), hc->tx_x_calc, (unsigned int)(hc->tx_x_recv >> 6)); ccid3_update_send_interval(hc); } } /** * ccid3_hc_tx_update_s - Track the mean packet size `s' * @len: DCCP packet payload size in bytes * * cf. RFC 4342, 5.3 and RFC 3448, 4.1 */ static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hc, int len) { const u16 old_s = hc->tx_s; hc->tx_s = tfrc_ewma(hc->tx_s, len, 9); if (hc->tx_s != old_s) ccid3_update_send_interval(hc); } /* * Update Window Counter using the algorithm from [RFC 4342, 8.1]. * As elsewhere, RTT > 0 is assumed by using dccp_sample_rtt(). */ static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hc, ktime_t now) { u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count), quarter_rtts = (4 * delta) / hc->tx_rtt; if (quarter_rtts > 0) { hc->tx_t_last_win_count = now; hc->tx_last_win_count += min(quarter_rtts, 5U); hc->tx_last_win_count &= 0xF; /* mod 16 */ } } static void ccid3_hc_tx_no_feedback_timer(struct timer_list *t) { struct ccid3_hc_tx_sock *hc = from_timer(hc, t, tx_no_feedback_timer); struct sock *sk = hc->sk; unsigned long t_nfb = USEC_PER_SEC / 5; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ /* XXX: set some sensible MIB */ goto restart_timer; } ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk, ccid3_tx_state_name(hc->tx_state)); /* Ignore and do not restart after leaving the established state */ if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) goto out; /* Reset feedback state to "no feedback received" */ if (hc->tx_state == TFRC_SSTATE_FBACK) ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); /* * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 * RTO is 0 if and only if no feedback has been received yet. */ if (hc->tx_t_rto == 0 || hc->tx_p == 0) { /* halve send rate directly */ hc->tx_x = max(hc->tx_x / 2, (((__u64)hc->tx_s) << 6) / TFRC_T_MBI); ccid3_update_send_interval(hc); } else { /* * Modify the cached value of X_recv * * If (X_calc > 2 * X_recv) * X_recv = max(X_recv / 2, s / (2 * t_mbi)); * Else * X_recv = X_calc / 4; * * Note that X_recv is scaled by 2^6 while X_calc is not */ if (hc->tx_x_calc > (hc->tx_x_recv >> 5)) hc->tx_x_recv = max(hc->tx_x_recv / 2, (((__u64)hc->tx_s) << 6) / (2*TFRC_T_MBI)); else { hc->tx_x_recv = hc->tx_x_calc; hc->tx_x_recv <<= 4; } ccid3_hc_tx_update_x(sk, NULL); } ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", (unsigned long long)hc->tx_x); /* * Set new timeout for the nofeedback timer. * See comments in packet_recv() regarding the value of t_RTO. */ if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */ t_nfb = TFRC_INITIAL_TIMEOUT; else t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi); restart_timer: sk_reset_timer(sk, &hc->tx_no_feedback_timer, jiffies + usecs_to_jiffies(t_nfb)); out: bh_unlock_sock(sk); sock_put(sk); } /** * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets * @skb: next packet candidate to send on @sk * * This function uses the convention of ccid_packet_dequeue_eval() and * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. */ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); ktime_t now = ktime_get_real(); s64 delay; /* * This function is called only for Data and DataAck packets. Sending * zero-sized Data(Ack)s is theoretically possible, but for congestion * control this case is pathological - ignore it. */ if (unlikely(skb->len == 0)) return -EBADMSG; if (hc->tx_state == TFRC_SSTATE_NO_SENT) { sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); hc->tx_last_win_count = 0; hc->tx_t_last_win_count = now; /* Set t_0 for initial packet */ hc->tx_t_nom = now; hc->tx_s = skb->len; /* * Use initial RTT sample when available: recommended by erratum * to RFC 4342. This implements the initialisation procedure of * draft rfc3448bis, section 4.2. Remember, X is scaled by 2^6. */ if (dp->dccps_syn_rtt) { ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); hc->tx_rtt = dp->dccps_syn_rtt; hc->tx_x = rfc3390_initial_rate(sk); hc->tx_t_ld = now; } else { /* * Sender does not have RTT sample: * - set fallback RTT (RFC 4340, 3.4) since a RTT value * is needed in several parts (e.g. window counter); * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. */ hc->tx_rtt = DCCP_FALLBACK_RTT; hc->tx_x = hc->tx_s; hc->tx_x <<= 6; } ccid3_update_send_interval(hc); ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); } else { delay = ktime_us_delta(hc->tx_t_nom, now); ccid3_pr_debug("delay=%ld\n", (long)delay); /* * Scheduling of packet transmissions (RFC 5348, 8.3) * * if (t_now > t_nom - delta) * // send the packet now * else * // send the packet in (t_nom - t_now) milliseconds. */ if (delay >= TFRC_T_DELTA) return (u32)delay / USEC_PER_MSEC; ccid3_hc_tx_update_win_count(hc, now); } /* prepare to send now (add options etc.) */ dp->dccps_hc_tx_insert_options = 1; DCCP_SKB_CB(skb)->dccpd_ccval = hc->tx_last_win_count; /* set the nominal send time for the next following packet */ hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi); return CCID_PACKET_SEND_AT_ONCE; } static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); ccid3_hc_tx_update_s(hc, len); if (tfrc_tx_hist_add(&hc->tx_hist, dccp_sk(sk)->dccps_gss)) DCCP_CRIT("packet history - out of memory!"); } static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); struct tfrc_tx_hist_entry *acked; ktime_t now; unsigned long t_nfb; u32 r_sample; /* we are only interested in ACKs */ if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) return; /* * Locate the acknowledged packet in the TX history. * * Returning "entry not found" here can for instance happen when * - the host has not sent out anything (e.g. a passive server), * - the Ack is outdated (packet with higher Ack number was received), * - it is a bogus Ack (for a packet not sent on this connection). */ acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb)); if (acked == NULL) return; /* For the sake of RTT sampling, ignore/remove all older entries */ tfrc_tx_hist_purge(&acked->next); /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */ now = ktime_get_real(); r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9); /* * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 */ if (hc->tx_state == TFRC_SSTATE_NO_FBACK) { ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); if (hc->tx_t_rto == 0) { /* * Initial feedback packet: Larger Initial Windows (4.2) */ hc->tx_x = rfc3390_initial_rate(sk); hc->tx_t_ld = now; ccid3_update_send_interval(hc); goto done_computing_x; } else if (hc->tx_p == 0) { /* * First feedback after nofeedback timer expiry (4.3) */ goto done_computing_x; } } /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ if (hc->tx_p > 0) hc->tx_x_calc = tfrc_calc_x(hc->tx_s, hc->tx_rtt, hc->tx_p); ccid3_hc_tx_update_x(sk, &now); done_computing_x: ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " "p=%u, X_calc=%u, X_recv=%u, X=%u\n", dccp_role(sk), sk, hc->tx_rtt, r_sample, hc->tx_s, hc->tx_p, hc->tx_x_calc, (unsigned int)(hc->tx_x_recv >> 6), (unsigned int)(hc->tx_x >> 6)); /* unschedule no feedback timer */ sk_stop_timer(sk, &hc->tx_no_feedback_timer); /* * As we have calculated new ipi, delta, t_nom it is possible * that we now can send a packet, so wake up dccp_wait_for_ccid */ sk->sk_write_space(sk); /* * Update timeout interval for the nofeedback timer. In order to control * rate halving on networks with very low RTTs (<= 1 ms), use per-route * tunable RTAX_RTO_MIN value as the lower bound. */ hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, USEC_PER_SEC/HZ * tcp_rto_min(sk)); /* * Schedule no feedback timer to expire in * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) */ t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi); ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " "expire in %lu jiffies (%luus)\n", dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb); sk_reset_timer(sk, &hc->tx_no_feedback_timer, jiffies + usecs_to_jiffies(t_nfb)); } static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, u8 option, u8 *optval, u8 optlen) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); __be32 opt_val; switch (option) { case TFRC_OPT_RECEIVE_RATE: case TFRC_OPT_LOSS_EVENT_RATE: /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */ if (packet_type == DCCP_PKT_DATA) break; if (unlikely(optlen != 4)) { DCCP_WARN("%s(%p), invalid len %d for %u\n", dccp_role(sk), sk, optlen, option); return -EINVAL; } opt_val = ntohl(get_unaligned((__be32 *)optval)); if (option == TFRC_OPT_RECEIVE_RATE) { /* Receive Rate is kept in units of 64 bytes/second */ hc->tx_x_recv = opt_val; hc->tx_x_recv <<= 6; ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", dccp_role(sk), sk, opt_val); } else { /* Update the fixpoint Loss Event Rate fraction */ hc->tx_p = tfrc_invert_loss_event_rate(opt_val); ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", dccp_role(sk), sk, opt_val); } } return 0; } static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) { struct ccid3_hc_tx_sock *hc = ccid_priv(ccid); hc->tx_state = TFRC_SSTATE_NO_SENT; hc->tx_hist = NULL; hc->sk = sk; timer_setup(&hc->tx_no_feedback_timer, ccid3_hc_tx_no_feedback_timer, 0); return 0; } static void ccid3_hc_tx_exit(struct sock *sk) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); sk_stop_timer(sk, &hc->tx_no_feedback_timer); tfrc_tx_hist_purge(&hc->tx_hist); } static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) { info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto; info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt; } static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); struct tfrc_tx_info tfrc; const void *val; switch (optname) { case DCCP_SOCKOPT_CCID_TX_INFO: if (len < sizeof(tfrc)) return -EINVAL; memset(&tfrc, 0, sizeof(tfrc)); tfrc.tfrctx_x = hc->tx_x; tfrc.tfrctx_x_recv = hc->tx_x_recv; tfrc.tfrctx_x_calc = hc->tx_x_calc; tfrc.tfrctx_rtt = hc->tx_rtt; tfrc.tfrctx_p = hc->tx_p; tfrc.tfrctx_rto = hc->tx_t_rto; tfrc.tfrctx_ipi = hc->tx_t_ipi; len = sizeof(tfrc); val = &tfrc; break; default: return -ENOPROTOOPT; } if (put_user(len, optlen) || copy_to_user(optval, val, len)) return -EFAULT; return 0; } /* * Receiver Half-Connection Routines */ /* CCID3 feedback types */ enum ccid3_fback_type { CCID3_FBACK_NONE = 0, CCID3_FBACK_INITIAL, CCID3_FBACK_PERIODIC, CCID3_FBACK_PARAM_CHANGE }; #ifdef CONFIG_IP_DCCP_CCID3_DEBUG static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) { static const char *const ccid3_rx_state_names[] = { [TFRC_RSTATE_NO_DATA] = "NO_DATA", [TFRC_RSTATE_DATA] = "DATA", }; return ccid3_rx_state_names[state]; } #endif static void ccid3_hc_rx_set_state(struct sock *sk, enum ccid3_hc_rx_states state) { struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); enum ccid3_hc_rx_states oldstate = hc->rx_state; ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", dccp_role(sk), sk, ccid3_rx_state_name(oldstate), ccid3_rx_state_name(state)); WARN_ON(state == oldstate); hc->rx_state = state; } static void ccid3_hc_rx_send_feedback(struct sock *sk, const struct sk_buff *skb, enum ccid3_fback_type fbtype) { struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); struct dccp_sock *dp = dccp_sk(sk); ktime_t now = ktime_get(); s64 delta = 0; switch (fbtype) { case CCID3_FBACK_INITIAL: hc->rx_x_recv = 0; hc->rx_pinv = ~0U; /* see RFC 4342, 8.5 */ break; case CCID3_FBACK_PARAM_CHANGE: /* * When parameters change (new loss or p > p_prev), we do not * have a reliable estimate for R_m of [RFC 3448, 6.2] and so * need to reuse the previous value of X_recv. However, when * X_recv was 0 (due to early loss), this would kill X down to * s/t_mbi (i.e. one packet in 64 seconds). * To avoid such drastic reduction, we approximate X_recv as * the number of bytes since last feedback. * This is a safe fallback, since X is bounded above by X_calc. */ if (hc->rx_x_recv > 0) break; /* fall through */ case CCID3_FBACK_PERIODIC: delta = ktime_us_delta(now, hc->rx_tstamp_last_feedback); if (delta <= 0) delta = 1; hc->rx_x_recv = scaled_div32(hc->rx_bytes_recv, delta); break; default: return; } ccid3_pr_debug("Interval %lldusec, X_recv=%u, 1/p=%u\n", delta, hc->rx_x_recv, hc->rx_pinv); hc->rx_tstamp_last_feedback = now; hc->rx_last_counter = dccp_hdr(skb)->dccph_ccval; hc->rx_bytes_recv = 0; dp->dccps_hc_rx_insert_options = 1; dccp_send_ack(sk); } static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) { const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); __be32 x_recv, pinv; if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) return 0; if (dccp_packet_without_ack(skb)) return 0; x_recv = htonl(hc->rx_x_recv); pinv = htonl(hc->rx_pinv); if (dccp_insert_option(skb, TFRC_OPT_LOSS_EVENT_RATE, &pinv, sizeof(pinv)) || dccp_insert_option(skb, TFRC_OPT_RECEIVE_RATE, &x_recv, sizeof(x_recv))) return -1; return 0; } /** * ccid3_first_li - Implements [RFC 5348, 6.3.1] * * Determine the length of the first loss interval via inverse lookup. * Assume that X_recv can be computed by the throughput equation * s * X_recv = -------- * R * fval * Find some p such that f(p) = fval; return 1/p (scaled). */ static u32 ccid3_first_li(struct sock *sk) { struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); u32 x_recv, p; s64 delta; u64 fval; if (hc->rx_rtt == 0) { DCCP_WARN("No RTT estimate available, using fallback RTT\n"); hc->rx_rtt = DCCP_FALLBACK_RTT; } delta = ktime_us_delta(ktime_get(), hc->rx_tstamp_last_feedback); if (delta <= 0) delta = 1; x_recv = scaled_div32(hc->rx_bytes_recv, delta); if (x_recv == 0) { /* would also trigger divide-by-zero */ DCCP_WARN("X_recv==0\n"); if (hc->rx_x_recv == 0) { DCCP_BUG("stored value of X_recv is zero"); return ~0U; } x_recv = hc->rx_x_recv; } fval = scaled_div(hc->rx_s, hc->rx_rtt); fval = scaled_div32(fval, x_recv); p = tfrc_calc_x_reverse_lookup(fval); ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); return p == 0 ? ~0U : scaled_div(1, p); } static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE; const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; const bool is_data_packet = dccp_data_packet(skb); if (unlikely(hc->rx_state == TFRC_RSTATE_NO_DATA)) { if (is_data_packet) { const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; do_feedback = CCID3_FBACK_INITIAL; ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); hc->rx_s = payload; /* * Not necessary to update rx_bytes_recv here, * since X_recv = 0 for the first feedback packet (cf. * RFC 3448, 6.3) -- gerrit */ } goto update_records; } if (tfrc_rx_hist_duplicate(&hc->rx_hist, skb)) return; /* done receiving */ if (is_data_packet) { const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; /* * Update moving-average of s and the sum of received payload bytes */ hc->rx_s = tfrc_ewma(hc->rx_s, payload, 9); hc->rx_bytes_recv += payload; } /* * Perform loss detection and handle pending losses */ if (tfrc_rx_handle_loss(&hc->rx_hist, &hc->rx_li_hist, skb, ndp, ccid3_first_li, sk)) { do_feedback = CCID3_FBACK_PARAM_CHANGE; goto done_receiving; } if (tfrc_rx_hist_loss_pending(&hc->rx_hist)) return; /* done receiving */ /* * Handle data packets: RTT sampling and monitoring p */ if (unlikely(!is_data_packet)) goto update_records; if (!tfrc_lh_is_initialised(&hc->rx_li_hist)) { const u32 sample = tfrc_rx_hist_sample_rtt(&hc->rx_hist, skb); /* * Empty loss history: no loss so far, hence p stays 0. * Sample RTT values, since an RTT estimate is required for the * computation of p when the first loss occurs; RFC 3448, 6.3.1. */ if (sample != 0) hc->rx_rtt = tfrc_ewma(hc->rx_rtt, sample, 9); } else if (tfrc_lh_update_i_mean(&hc->rx_li_hist, skb)) { /* * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean * has decreased (resp. p has increased), send feedback now. */ do_feedback = CCID3_FBACK_PARAM_CHANGE; } /* * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 */ if (SUB16(dccp_hdr(skb)->dccph_ccval, hc->rx_last_counter) > 3) do_feedback = CCID3_FBACK_PERIODIC; update_records: tfrc_rx_hist_add_packet(&hc->rx_hist, skb, ndp); done_receiving: if (do_feedback) ccid3_hc_rx_send_feedback(sk, skb, do_feedback); } static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) { struct ccid3_hc_rx_sock *hc = ccid_priv(ccid); hc->rx_state = TFRC_RSTATE_NO_DATA; tfrc_lh_init(&hc->rx_li_hist); return tfrc_rx_hist_alloc(&hc->rx_hist); } static void ccid3_hc_rx_exit(struct sock *sk) { struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); tfrc_rx_hist_purge(&hc->rx_hist); tfrc_lh_cleanup(&hc->rx_li_hist); } static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) { info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state; info->tcpi_options |= TCPI_OPT_TIMESTAMPS; info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt; } static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); struct tfrc_rx_info rx_info; const void *val; switch (optname) { case DCCP_SOCKOPT_CCID_RX_INFO: if (len < sizeof(rx_info)) return -EINVAL; rx_info.tfrcrx_x_recv = hc->rx_x_recv; rx_info.tfrcrx_rtt = hc->rx_rtt; rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hc->rx_pinv); len = sizeof(rx_info); val = &rx_info; break; default: return -ENOPROTOOPT; } if (put_user(len, optlen) || copy_to_user(optval, val, len)) return -EFAULT; return 0; } struct ccid_operations ccid3_ops = { .ccid_id = DCCPC_CCID3, .ccid_name = "TCP-Friendly Rate Control", .ccid_hc_tx_obj_size = sizeof(struct ccid3_hc_tx_sock), .ccid_hc_tx_init = ccid3_hc_tx_init, .ccid_hc_tx_exit = ccid3_hc_tx_exit, .ccid_hc_tx_send_packet = ccid3_hc_tx_send_packet, .ccid_hc_tx_packet_sent = ccid3_hc_tx_packet_sent, .ccid_hc_tx_packet_recv = ccid3_hc_tx_packet_recv, .ccid_hc_tx_parse_options = ccid3_hc_tx_parse_options, .ccid_hc_rx_obj_size = sizeof(struct ccid3_hc_rx_sock), .ccid_hc_rx_init = ccid3_hc_rx_init, .ccid_hc_rx_exit = ccid3_hc_rx_exit, .ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options, .ccid_hc_rx_packet_recv = ccid3_hc_rx_packet_recv, .ccid_hc_rx_get_info = ccid3_hc_rx_get_info, .ccid_hc_tx_get_info = ccid3_hc_tx_get_info, .ccid_hc_rx_getsockopt = ccid3_hc_rx_getsockopt, .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, }; #ifdef CONFIG_IP_DCCP_CCID3_DEBUG module_param(ccid3_debug, bool, 0644); MODULE_PARM_DESC(ccid3_debug, "Enable CCID-3 debug messages"); #endif |
31 5 21 2 80 3 23 1 14 11 13 13 13 16 18 2 21 14 7 17 19 19 19 17 17 16 30 30 30 30 37 10 1 1 36 36 37 32 37 6 8 8 19 4 3 15 14 14 14 14 1 14 1 19 5 5 2 1 3 1 3 1 3 3 3 2 2 2 19 17 8 8 2 10 18 5 10 8 8 28 28 3 2 3 70 67 67 67 67 14 139 139 139 139 6 3 2 2 1 2 1 1 1 11 66 120 78 39 38 37 37 37 37 2 35 1 15 97 97 94 93 91 91 90 87 15 82 17 10 3 17 4 1 1 12 1 11 11 3 11 78 75 73 2 75 75 70 69 39 31 6 68 17 67 57 56 56 72 1 63 22 3 22 5 2 4 2 2 4 2 5 117 9 117 9 9 9 9 111 111 111 111 9 9 5 5 2 2 2 2 2 2 2447 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 | /* * Copyright (c) 2007 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * The code this is based on carried the following copyright notice: * --- * (C) Copyright 2001-2006 * Alex Zeffertt, Cambridge Broadband Ltd, ajz@cambridgebroadband.com * Re-worked by Ben Greear <greearb@candelatech.com> * --- */ #include <linux/kernel.h> #include <linux/types.h> #include <linux/module.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/rculist.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include <linux/if_link.h> #include <linux/if_macvlan.h> #include <linux/hash.h> #include <linux/workqueue.h> #include <net/rtnetlink.h> #include <net/xfrm.h> #include <linux/netpoll.h> #define MACVLAN_HASH_BITS 8 #define MACVLAN_HASH_SIZE (1<<MACVLAN_HASH_BITS) #define MACVLAN_BC_QUEUE_LEN 1000 #define MACVLAN_F_PASSTHRU 1 #define MACVLAN_F_ADDRCHANGE 2 struct macvlan_port { struct net_device *dev; struct hlist_head vlan_hash[MACVLAN_HASH_SIZE]; struct list_head vlans; struct sk_buff_head bc_queue; struct work_struct bc_work; u32 flags; int count; struct hlist_head vlan_source_hash[MACVLAN_HASH_SIZE]; DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ); unsigned char perm_addr[ETH_ALEN]; }; struct macvlan_source_entry { struct hlist_node hlist; struct macvlan_dev *vlan; unsigned char addr[6+2] __aligned(sizeof(u16)); struct rcu_head rcu; }; struct macvlan_skb_cb { const struct macvlan_dev *src; }; #define MACVLAN_SKB_CB(__skb) ((struct macvlan_skb_cb *)&((__skb)->cb[0])) static void macvlan_port_destroy(struct net_device *dev); static inline bool macvlan_passthru(const struct macvlan_port *port) { return port->flags & MACVLAN_F_PASSTHRU; } static inline void macvlan_set_passthru(struct macvlan_port *port) { port->flags |= MACVLAN_F_PASSTHRU; } static inline bool macvlan_addr_change(const struct macvlan_port *port) { return port->flags & MACVLAN_F_ADDRCHANGE; } static inline void macvlan_set_addr_change(struct macvlan_port *port) { port->flags |= MACVLAN_F_ADDRCHANGE; } static inline void macvlan_clear_addr_change(struct macvlan_port *port) { port->flags &= ~MACVLAN_F_ADDRCHANGE; } /* Hash Ethernet address */ static u32 macvlan_eth_hash(const unsigned char *addr) { u64 value = get_unaligned((u64 *)addr); /* only want 6 bytes */ #ifdef __BIG_ENDIAN value >>= 16; #else value <<= 16; #endif return hash_64(value, MACVLAN_HASH_BITS); } static struct macvlan_port *macvlan_port_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } static struct macvlan_port *macvlan_port_get_rtnl(const struct net_device *dev) { return rtnl_dereference(dev->rx_handler_data); } #define macvlan_port_exists(dev) (dev->priv_flags & IFF_MACVLAN_PORT) static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port, const unsigned char *addr) { struct macvlan_dev *vlan; u32 idx = macvlan_eth_hash(addr); hlist_for_each_entry_rcu(vlan, &port->vlan_hash[idx], hlist) { if (ether_addr_equal_64bits(vlan->dev->dev_addr, addr)) return vlan; } return NULL; } static struct macvlan_source_entry *macvlan_hash_lookup_source( const struct macvlan_dev *vlan, const unsigned char *addr) { struct macvlan_source_entry *entry; u32 idx = macvlan_eth_hash(addr); struct hlist_head *h = &vlan->port->vlan_source_hash[idx]; hlist_for_each_entry_rcu(entry, h, hlist) { if (ether_addr_equal_64bits(entry->addr, addr) && entry->vlan == vlan) return entry; } return NULL; } static int macvlan_hash_add_source(struct macvlan_dev *vlan, const unsigned char *addr) { struct macvlan_port *port = vlan->port; struct macvlan_source_entry *entry; struct hlist_head *h; entry = macvlan_hash_lookup_source(vlan, addr); if (entry) return 0; entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; ether_addr_copy(entry->addr, addr); entry->vlan = vlan; h = &port->vlan_source_hash[macvlan_eth_hash(addr)]; hlist_add_head_rcu(&entry->hlist, h); vlan->macaddr_count++; return 0; } static void macvlan_hash_add(struct macvlan_dev *vlan) { struct macvlan_port *port = vlan->port; const unsigned char *addr = vlan->dev->dev_addr; u32 idx = macvlan_eth_hash(addr); hlist_add_head_rcu(&vlan->hlist, &port->vlan_hash[idx]); } static void macvlan_hash_del_source(struct macvlan_source_entry *entry) { hlist_del_rcu(&entry->hlist); kfree_rcu(entry, rcu); } static void macvlan_hash_del(struct macvlan_dev *vlan, bool sync) { hlist_del_rcu(&vlan->hlist); if (sync) synchronize_rcu(); } static void macvlan_hash_change_addr(struct macvlan_dev *vlan, const unsigned char *addr) { macvlan_hash_del(vlan, true); /* Now that we are unhashed it is safe to change the device * address without confusing packet delivery. */ memcpy(vlan->dev->dev_addr, addr, ETH_ALEN); macvlan_hash_add(vlan); } static bool macvlan_addr_busy(const struct macvlan_port *port, const unsigned char *addr) { /* Test to see if the specified address is * currently in use by the underlying device or * another macvlan. */ if (!macvlan_passthru(port) && !macvlan_addr_change(port) && ether_addr_equal_64bits(port->dev->dev_addr, addr)) return true; if (macvlan_hash_lookup(port, addr)) return true; return false; } static int macvlan_broadcast_one(struct |